瀏覽代碼

Round 3 Kyber

tags/v0.0.1
John M. Schanck 4 年之前
committed by Kris Kwiatkowski
父節點
當前提交
127e9ec326
共有 100 個檔案被更改,包括 3701 行新增4293 行删除
  1. +3
    -3
      crypto_kem/kyber1024-90s/META.yml
  2. +27
    -30
      crypto_kem/kyber1024-90s/avx2/aes256ctr.c
  3. +9
    -13
      crypto_kem/kyber1024-90s/avx2/align.h
  4. +96
    -237
      crypto_kem/kyber1024-90s/avx2/basemul.S
  5. +41
    -43
      crypto_kem/kyber1024-90s/avx2/cbd.c
  6. +4
    -1
      crypto_kem/kyber1024-90s/avx2/cbd.h
  7. +7
    -3
      crypto_kem/kyber1024-90s/avx2/cdecl.h
  8. +88
    -120
      crypto_kem/kyber1024-90s/avx2/consts.c
  9. +2
    -11
      crypto_kem/kyber1024-90s/avx2/consts.h
  10. +8
    -51
      crypto_kem/kyber1024-90s/avx2/fq.S
  11. +5
    -4
      crypto_kem/kyber1024-90s/avx2/fq.inc
  12. +120
    -110
      crypto_kem/kyber1024-90s/avx2/indcpa.c
  13. +122
    -154
      crypto_kem/kyber1024-90s/avx2/invntt.S
  14. +36
    -36
      crypto_kem/kyber1024-90s/avx2/kem.c
  15. +154
    -185
      crypto_kem/kyber1024-90s/avx2/ntt.S
  16. +12
    -15
      crypto_kem/kyber1024-90s/avx2/ntt.h
  17. +3
    -2
      crypto_kem/kyber1024-90s/avx2/params.h
  18. +157
    -116
      crypto_kem/kyber1024-90s/avx2/poly.c
  19. +8
    -11
      crypto_kem/kyber1024-90s/avx2/poly.h
  20. +93
    -72
      crypto_kem/kyber1024-90s/avx2/polyvec.c
  21. +3
    -7
      crypto_kem/kyber1024-90s/avx2/polyvec.h
  22. +4
    -5
      crypto_kem/kyber1024-90s/avx2/reduce.h
  23. +72
    -309
      crypto_kem/kyber1024-90s/avx2/rejsample.c
  24. +5
    -2
      crypto_kem/kyber1024-90s/avx2/rejsample.h
  25. +6
    -6
      crypto_kem/kyber1024-90s/avx2/shuffle.S
  26. +10
    -10
      crypto_kem/kyber1024-90s/avx2/shuffle.inc
  27. +4
    -6
      crypto_kem/kyber1024-90s/avx2/symmetric.h
  28. +33
    -35
      crypto_kem/kyber1024-90s/avx2/verify.c
  29. +2
    -2
      crypto_kem/kyber1024-90s/clean/Makefile
  30. +1
    -1
      crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake
  31. +564
    -0
      crypto_kem/kyber1024-90s/clean/aes256ctr.c
  32. +28
    -0
      crypto_kem/kyber1024-90s/clean/aes256ctr.h
  33. +38
    -5
      crypto_kem/kyber1024-90s/clean/cbd.c
  34. +3
    -1
      crypto_kem/kyber1024-90s/clean/cbd.h
  35. +72
    -77
      crypto_kem/kyber1024-90s/clean/indcpa.c
  36. +16
    -15
      crypto_kem/kyber1024-90s/clean/kem.c
  37. +42
    -57
      crypto_kem/kyber1024-90s/clean/ntt.c
  38. +1
    -6
      crypto_kem/kyber1024-90s/clean/ntt.h
  39. +7
    -9
      crypto_kem/kyber1024-90s/clean/params.h
  40. +55
    -50
      crypto_kem/kyber1024-90s/clean/poly.c
  41. +6
    -5
      crypto_kem/kyber1024-90s/clean/poly.h
  42. +15
    -36
      crypto_kem/kyber1024-90s/clean/polyvec.c
  43. +4
    -8
      crypto_kem/kyber1024-90s/clean/polyvec.h
  44. +4
    -20
      crypto_kem/kyber1024-90s/clean/reduce.c
  45. +0
    -2
      crypto_kem/kyber1024-90s/clean/reduce.h
  46. +10
    -92
      crypto_kem/kyber1024-90s/clean/symmetric-aes.c
  47. +0
    -19
      crypto_kem/kyber1024-90s/clean/symmetric-aes.h
  48. +12
    -7
      crypto_kem/kyber1024-90s/clean/symmetric.h
  49. +3
    -3
      crypto_kem/kyber1024/META.yml
  50. +9
    -13
      crypto_kem/kyber1024/avx2/align.h
  51. +96
    -237
      crypto_kem/kyber1024/avx2/basemul.S
  52. +41
    -43
      crypto_kem/kyber1024/avx2/cbd.c
  53. +4
    -1
      crypto_kem/kyber1024/avx2/cbd.h
  54. +7
    -3
      crypto_kem/kyber1024/avx2/cdecl.h
  55. +88
    -120
      crypto_kem/kyber1024/avx2/consts.c
  56. +2
    -11
      crypto_kem/kyber1024/avx2/consts.h
  57. +30
    -46
      crypto_kem/kyber1024/avx2/fips202x4.c
  58. +2
    -2
      crypto_kem/kyber1024/avx2/fips202x4.h
  59. +8
    -51
      crypto_kem/kyber1024/avx2/fq.S
  60. +5
    -4
      crypto_kem/kyber1024/avx2/fq.inc
  61. +112
    -120
      crypto_kem/kyber1024/avx2/indcpa.c
  62. +122
    -154
      crypto_kem/kyber1024/avx2/invntt.S
  63. +36
    -36
      crypto_kem/kyber1024/avx2/kem.c
  64. +154
    -185
      crypto_kem/kyber1024/avx2/ntt.S
  65. +12
    -15
      crypto_kem/kyber1024/avx2/ntt.h
  66. +3
    -2
      crypto_kem/kyber1024/avx2/params.h
  67. +176
    -133
      crypto_kem/kyber1024/avx2/poly.c
  68. +10
    -12
      crypto_kem/kyber1024/avx2/poly.h
  69. +93
    -72
      crypto_kem/kyber1024/avx2/polyvec.c
  70. +3
    -7
      crypto_kem/kyber1024/avx2/polyvec.h
  71. +4
    -5
      crypto_kem/kyber1024/avx2/reduce.h
  72. +72
    -309
      crypto_kem/kyber1024/avx2/rejsample.c
  73. +5
    -2
      crypto_kem/kyber1024/avx2/rejsample.h
  74. +6
    -6
      crypto_kem/kyber1024/avx2/shuffle.S
  75. +10
    -10
      crypto_kem/kyber1024/avx2/shuffle.inc
  76. +12
    -18
      crypto_kem/kyber1024/avx2/symmetric-shake.c
  77. +3
    -8
      crypto_kem/kyber1024/avx2/symmetric.h
  78. +33
    -35
      crypto_kem/kyber1024/avx2/verify.c
  79. +38
    -5
      crypto_kem/kyber1024/clean/cbd.c
  80. +3
    -1
      crypto_kem/kyber1024/clean/cbd.h
  81. +72
    -77
      crypto_kem/kyber1024/clean/indcpa.c
  82. +16
    -15
      crypto_kem/kyber1024/clean/kem.c
  83. +42
    -57
      crypto_kem/kyber1024/clean/ntt.c
  84. +1
    -6
      crypto_kem/kyber1024/clean/ntt.h
  85. +7
    -9
      crypto_kem/kyber1024/clean/params.h
  86. +55
    -50
      crypto_kem/kyber1024/clean/poly.c
  87. +6
    -5
      crypto_kem/kyber1024/clean/poly.h
  88. +15
    -36
      crypto_kem/kyber1024/clean/polyvec.c
  89. +4
    -8
      crypto_kem/kyber1024/clean/polyvec.h
  90. +4
    -20
      crypto_kem/kyber1024/clean/reduce.c
  91. +0
    -2
      crypto_kem/kyber1024/clean/reduce.h
  92. +12
    -18
      crypto_kem/kyber1024/clean/symmetric-shake.c
  93. +3
    -8
      crypto_kem/kyber1024/clean/symmetric.h
  94. +4
    -4
      crypto_kem/kyber512-90s/META.yml
  95. +27
    -30
      crypto_kem/kyber512-90s/avx2/aes256ctr.c
  96. +9
    -13
      crypto_kem/kyber512-90s/avx2/align.h
  97. +1
    -1
      crypto_kem/kyber512-90s/avx2/api.h
  98. +96
    -205
      crypto_kem/kyber512-90s/avx2/basemul.S
  99. +109
    -50
      crypto_kem/kyber512-90s/avx2/cbd.c
  100. +4
    -1
      crypto_kem/kyber512-90s/avx2/cbd.h

+ 3
- 3
crypto_kem/kyber1024-90s/META.yml 查看文件

@@ -6,7 +6,7 @@ length-public-key: 1568
length-ciphertext: 1568
length-secret-key: 3168
length-shared-secret: 32
nistkat-sha256: d3064040a33c15b65eb55dfd1bb116d092dab2cf5d693f8ab02b91ed105d66e3
nistkat-sha256: a1b564348a126a118fbc49a6aeaebcb74896753fd99f30eeb0f75f0b2d25115f
principal-submitters:
- Peter Schwabe
auxiliary-submitters:
@@ -21,9 +21,9 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
- name: avx2
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
supported_platforms:
- architecture: x86_64
operating_systems:


+ 27
- 30
crypto_kem/kyber1024-90s/avx2/aes256ctr.c 查看文件

@@ -2,52 +2,48 @@
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
/*
Based heavily on public-domain code by Romain Dolbeau
Different handling of nonce+counter than original version
using separated 64-bit nonce and internal 64-bit counter, starting from zero
Public Domain
*/
/* Based heavily on public-domain code by Romain Dolbeau
* Different handling of nonce+counter than original version using
* separated 64-bit nonce and internal 64-bit counter, starting from zero
* Public Domain */


static inline void aesni_encrypt4(uint8_t out[64],
__m128i *n,
const __m128i rkeys[16]) {
__m128i f, f0, f1, f2, f3, t;
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) {
__m128i f, f0, f1, f2, f3;
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

/* Load current counter value */
f = _mm_load_si128(n);

/* Increase counter in 4 consecutive blocks */
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t);
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx);

/* Write counter for next iteration, increased by 4 */
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0)));

/* Actual AES encryption, 4x interleaved */
t = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, t);
f1 = _mm_xor_si128(f1, t);
f2 = _mm_xor_si128(f2, t);
f3 = _mm_xor_si128(f3, t);
f = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, f);
f1 = _mm_xor_si128(f1, f);
f2 = _mm_xor_si128(f2, f);
f3 = _mm_xor_si128(f3, f);

for (int i = 1; i < 14; i++) {
t = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, t);
f1 = _mm_aesenc_si128(f1, t);
f2 = _mm_aesenc_si128(f2, t);
f3 = _mm_aesenc_si128(f3, t);
f = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, f);
f1 = _mm_aesenc_si128(f1, f);
f2 = _mm_aesenc_si128(f2, f);
f3 = _mm_aesenc_si128(f3, f);
}

t = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, t);
f1 = _mm_aesenclast_si128(f1, t);
f2 = _mm_aesenclast_si128(f2, t);
f3 = _mm_aesenclast_si128(f3, t);
f = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, f);
f1 = _mm_aesenclast_si128(f1, f);
f2 = _mm_aesenclast_si128(f2, f);
f3 = _mm_aesenclast_si128(f3, f);

/* Write results */
_mm_storeu_si128((__m128i *)(out + 0), f0);
@@ -134,6 +130,7 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out,
while (outlen >= 64) {
aesni_encrypt4(out, &state.n, state.rkeys);
outlen -= 64;
out += 64;
}

if (outlen) {


+ 9
- 13
crypto_kem/kyber1024-90s/avx2/align.h 查看文件

@@ -2,22 +2,18 @@
#define PQCLEAN_KYBER102490S_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

#define ALIGN16_TYPE(t) \
union { \
__m128i vec; \
t orig; \
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[(N)]; \
__m256i vec[((N)+31)/32]; \
}

#define ALIGN32_ARRAY(t, s) \
union { \
__m256i vec; \
t arr[(s)]; \
#define ALIGNED_INT16(N) \
union { \
int16_t coeffs[(N)]; \
__m256i vec[((N)+15)/16]; \
}

#define ALIGN32_ARRAY_2D(t, n, m) \
union { \
__m256i vec; \
t arr[(n)][(m)]; \
}
#endif

+ 96
- 237
crypto_kem/kyber1024-90s/avx2/basemul.S 查看文件

@@ -1,248 +1,107 @@
#include "cdecl.h"
#include "params.h"

.macro schoolbook off,sign
#load
vmovdqa \off+32(%rsi),%ymm7 # b
vmovdqa \off+32(%rdx),%ymm8 # d
vmovdqa \off(%rsi),%ymm9 # a
vmovdqa \off(%rdx),%ymm10 # c

#mul
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi

#reduce
vpmullw %ymm1,%ymm11,%ymm11
vpmulhw %ymm0,%ymm11,%ymm11
vpsubw %ymm11,%ymm12,%ymm11 # bd

#mul
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi

#unpack
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1

#add
.ifeq \sign
vpaddd %ymm14,%ymm15,%ymm14 # x0
vpaddd %ymm9,%ymm10,%ymm9 # x1
.else
vpsubd %ymm15,%ymm14,%ymm14 # x0
vpsubd %ymm10,%ymm9,%ymm9 # x1
.endif
vpaddd %ymm12,%ymm13,%ymm12 # y0
vpaddd %ymm7,%ymm8,%ymm7 # y1
.endm

.macro red a0,a1,b0,b1,x,y,z
#pack
vpxor %ymm\x,%ymm\x,%ymm\x
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z
vpsrld $16,%ymm\a0,%ymm\a0
vpsrld $16,%ymm\a1,%ymm\a1
vpackusdw %ymm\z,%ymm\y,%ymm\z
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x
vpsrld $16,%ymm\b0,%ymm\b0
vpsrld $16,%ymm\b1,%ymm\b1
vpackusdw %ymm\x,%ymm\y,%ymm\y
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0

#reduce
vpmullw %ymm1,%ymm\z,%ymm\z
vpmullw %ymm1,%ymm\y,%ymm\y
vpmulhw %ymm0,%ymm\z,%ymm\z
vpmulhw %ymm0,%ymm\y,%ymm\y
vpsubw %ymm\z,%ymm\a0,%ymm\a0
vpsubw %ymm\y,%ymm\b0,%ymm\b0
.macro schoolbook off
vmovdqa _16XQINV*2(%rcx),%ymm0
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1

vpmullw %ymm0,%ymm1,%ymm9 # a0.lo
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo

vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0

vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi

vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1

vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi

vmovdqa %ymm13,(%rsp)

vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo

vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo

vmovdqa _16XQ*2(%rcx),%ymm8
vpmulhw %ymm8,%ymm13,%ymm13
vpmulhw %ymm8,%ymm9,%ymm9
vpmulhw %ymm8,%ymm5,%ymm5
vpmulhw %ymm8,%ymm10,%ymm10
vpmulhw %ymm8,%ymm6,%ymm6
vpmulhw %ymm8,%ymm11,%ymm11
vpmulhw %ymm8,%ymm7,%ymm7
vpmulhw %ymm8,%ymm12,%ymm12

vpsubw (%rsp),%ymm13,%ymm13 # -a0c0
vpsubw %ymm9,%ymm1,%ymm9 # a0d0
vpsubw %ymm5,%ymm14,%ymm5 # b0c0
vpsubw %ymm10,%ymm2,%ymm10 # b0d0

vpsubw %ymm6,%ymm15,%ymm6 # a1c1
vpsubw %ymm11,%ymm3,%ymm11 # a1d1
vpsubw %ymm7,%ymm0,%ymm7 # b1c1
vpsubw %ymm12,%ymm4,%ymm12 # b1d1

vmovdqa (%r9),%ymm0
vmovdqa 32(%r9),%ymm1
vpmullw %ymm0,%ymm10,%ymm2
vpmullw %ymm0,%ymm12,%ymm3
vpmulhw %ymm1,%ymm10,%ymm10
vpmulhw %ymm1,%ymm12,%ymm12
vpmulhw %ymm8,%ymm2,%ymm2
vpmulhw %ymm8,%ymm3,%ymm3
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1

vpaddw %ymm5,%ymm9,%ymm9
vpaddw %ymm7,%ymm11,%ymm11
vpsubw %ymm13,%ymm10,%ymm13
vpsubw %ymm12,%ymm6,%ymm6

vmovdqa %ymm13,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(64*\off+16)*2(%rdi)
vmovdqa %ymm6,(64*\off+32)*2(%rdi)
vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.endm

.text
basemul64_acc_avx:
poly0.0:
schoolbook 0,0

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.0:
schoolbook 512,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly2.0:
schoolbook 1024,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly3.0:
schoolbook 1536,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm5,32(%rdi)

poly0.1:
schoolbook 64,1

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.1:
schoolbook 576,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly2.1:
schoolbook 1088,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly3.1:
schoolbook 1600,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm5,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx)
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

ret

basemul64_avx:
schoolbook 0,0

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,(%rdi)
vmovdqa %ymm12,32(%rdi)

schoolbook 64,1

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,64(%rdi)
vmovdqa %ymm12,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx)
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx
mov %rsp,%r8
and $-32,%rsp
sub $32,%rsp

lea (_ZETAS_EXP+176)*2(%rcx),%r9
schoolbook 0

add $32*2,%r9
schoolbook 1

add $192*2,%r9
schoolbook 2

add $32*2,%r9
schoolbook 3

mov %r8,%rsp
ret

+ 41
- 43
crypto_kem/kyber1024-90s/avx2/cbd.c 查看文件

@@ -4,66 +4,64 @@
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_cbd
* Name: cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* a centered binomial distribution with parameter eta=2
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *buf: pointer to input byte array
* Arguments: - poly *r: pointer to output polynomial
* - const __m256i *buf: pointer to aligned input byte array
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) {
unsigned int i;
__m256i vec0, vec1, vec2, vec3, tmp;
__m256i f0, f1, f2, f3;
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);

for (i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]);
f0 = _mm256_load_si256(&buf[i]);

vec1 = _mm256_srli_epi32(vec0, 1);
vec0 = _mm256_and_si256(mask55, vec0);
vec1 = _mm256_and_si256(mask55, vec1);
vec0 = _mm256_add_epi32(vec0, vec1);
f1 = _mm256_srli_epi16(f0, 1);
f0 = _mm256_and_si256(mask55, f0);
f1 = _mm256_and_si256(mask55, f1);
f0 = _mm256_add_epi8(f0, f1);

vec1 = _mm256_srli_epi32(vec0, 2);
vec0 = _mm256_and_si256(mask33, vec0);
vec1 = _mm256_and_si256(mask33, vec1);
f1 = _mm256_srli_epi16(f0, 2);
f0 = _mm256_and_si256(mask33, f0);
f1 = _mm256_and_si256(mask33, f1);
f0 = _mm256_add_epi8(f0, mask33);
f0 = _mm256_sub_epi8(f0, f1);

vec2 = _mm256_srli_epi32(vec0, 4);
vec3 = _mm256_srli_epi32(vec1, 4);
vec0 = _mm256_and_si256(mask03, vec0);
vec1 = _mm256_and_si256(mask03, vec1);
vec2 = _mm256_and_si256(mask03, vec2);
vec3 = _mm256_and_si256(mask03, vec3);
f1 = _mm256_srli_epi16(f0, 4);
f0 = _mm256_and_si256(mask0F, f0);
f1 = _mm256_and_si256(mask0F, f1);
f0 = _mm256_sub_epi8(f0, mask03);
f1 = _mm256_sub_epi8(f1, mask03);

vec1 = _mm256_sub_epi8(vec0, vec1);
vec3 = _mm256_sub_epi8(vec2, vec3);
f2 = _mm256_unpacklo_epi8(f0, f1);
f3 = _mm256_unpackhi_epi8(f0, f1);

vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1));
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1));
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3));
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1));
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1));
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1));

tmp = _mm256_unpacklo_epi16(vec0, vec2);
vec2 = _mm256_unpackhi_epi16(vec0, vec2);
vec0 = tmp;
tmp = _mm256_unpacklo_epi16(vec1, vec3);
vec3 = _mm256_unpackhi_epi16(vec1, vec3);
vec1 = tmp;
_mm256_store_si256(&r->vec[4 * i + 0], f0);
_mm256_store_si256(&r->vec[4 * i + 1], f2);
_mm256_store_si256(&r->vec[4 * i + 2], f1);
_mm256_store_si256(&r->vec[4 * i + 3], f3);
}
}

tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20);
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31);
vec0 = tmp;
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20);
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31);
vec1 = tmp;

_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3);
}
/* buf 32 bytes longer for cbd3 */
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) {
cbd2(r, buf);
}

void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) {
cbd2(r, buf);
}

+ 4
- 1
crypto_kem/kyber1024-90s/avx2/cbd.h 查看文件

@@ -2,8 +2,11 @@
#define PQCLEAN_KYBER102490S_AVX2_CBD_H
#include "params.h"
#include "poly.h"
#include <immintrin.h>
#include <stdint.h>

void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);
void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]);

void PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]);

#endif

+ 7
- 3
crypto_kem/kyber1024-90s/avx2/cdecl.h 查看文件

@@ -1,6 +1,8 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_CDECL_H
#define PQCLEAN_KYBER102490S_AVX2_CDECL_H



#define _16XQ 0
#define _16XQINV 16
#define _16XV 32
@@ -9,9 +11,10 @@
#define _16XMONTSQLO 80
#define _16XMONTSQHI 96
#define _16XMASK 112
#define _ZETAS_EXP 128
#define _ZETAS_INV_EXP 528

#define _REVIDXB 128
#define _REVIDXD 144
#define _ZETAS_EXP 160
#define _16XSHIFT 624

/* The C ABI on MacOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from
@@ -23,4 +26,5 @@

#define _cdecl(s) _##s
#define cdecl(s) s

#endif

+ 88
- 120
crypto_kem/kyber1024-90s/avx2/consts.c 查看文件

@@ -1,155 +1,123 @@
#include "align.h"
#include "consts.h"
#include "params.h"
#include <stdint.h>

#define Q KYBER_Q
#define MONT ((1U << 16) % Q)
#define QINV 62209 // q^-1 mod 2^16
#define V (((1U << 26) + Q/2)/Q)
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q)
#define FLO (FHI*QINV % 65536)
#define MONTSQHI (MONT*MONT % Q)
#define MONTSQLO (MONTSQHI*QINV % 65536)
#define MONT (-1044) // 2^16 mod q
#define QINV (-3327) // q^-1 mod 2^16
#define V 20159 // floor(2^26/q + 0.5)
#define FHI 1441 // mont^2/128
#define FLO (-10079) // qinv*FHI
#define MONTSQHI 1353 // mont^2
#define MONTSQLO 20553 // qinv*MONTSQHI
#define MASK 4095
#define SHIFT 32


const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = {
#define _16XQ 0
const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.coeffs = {
//#define _16XQ 0
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,

#define _16XQINV 16
//#define _16XQINV 16
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

#define _16XV 32
//#define _16XV 32
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,

#define _16XFLO 48
//#define _16XFLO 48
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,

#define _16XFHI 64
//#define _16XFHI 64
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,

#define _16XMONTSQLO 80
//#define _16XMONTSQLO 80
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,

#define _16XMONTSQHI 96
//#define _16XMONTSQHI 96
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,

#define _16XMASK 112
//#define _16XMASK 112
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,

#define _ZETAS_EXP 128
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758,
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846,
3158, 3158, 3158, 3158, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479,
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295,
573, 573, 2004, 2004, 264, 264, 383, 383,
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199,
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081,
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837,
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785,
516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182,
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261,
2226, 555, 2078, 1550, 422, 177, 3038, 1574,
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173,
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493,
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918,
430, 843, 871, 105, 587, 3094, 2869, 1653,
778, 3182, 1483, 1119, 644, 349, 329, 3254,
788, 788, 1812, 1812, 28191, 28191, 28191, 28191,
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842,
48842, 48842, 48842, 48842, 287, 287, 287, 287,
287, 287, 287, 287, 202, 202, 202, 202,
202, 202, 202, 202, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335,
31164, 31164, 31164, 31164, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313,
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859,
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017,
732, 732, 608, 608, 1787, 1787, 411, 411,
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638,
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780,
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604,
448, 2264, 677, 2054, 34353, 25435, 58154, 24392,
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907,
31637, 28644, 23998, 48114, 817, 603, 1322, 1864,
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459,
3221, 996, 958, 1522, 20297, 2146, 15356, 33152,
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094,
41677, 45279, 7757, 23132, 1097, 610, 2044, 384,
3193, 1994, 220, 1670, 1799, 794, 2475, 478,
3021, 991, 1869, 1628, 0, 0, 0, 0,
//#define _REVIDXB 128
3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
3854, 3340, 2826, 2312, 1798, 1284, 770, 256,

//#define _REVIDXD 144
7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,

//#define _ZETAS_EXP 160
31498, 31498, 31498, 31498, -758, -758, -758, -758,
5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397,
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
-359, -359, -359, -359, -359, -359, -359, -359,
-359, -359, -359, -359, -359, -359, -359, -359,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
-12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
-20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758,
-3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690,
-171, -171, -171, -171, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
-5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057,
5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242,
573, 573, -1325, -1325, 264, 264, 383, 383,
-829, -829, 1458, 1458, -1602, -1602, -130, -130,
-5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080,
-12796, 26616, 16064, -12442, 9134, -650, -25986, 27837,
1223, 652, -552, 1015, -1293, 1491, -282, -1544,
516, -8, -320, -666, -1618, -1162, 126, 1469,
-335, -11477, -32227, 20494, -27738, 945, -14883, 6182,
32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276,
-1103, 555, -1251, 1550, 422, 177, -291, 1574,
-246, 1159, -777, -602, -1590, -872, 418, -156,
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493,
-32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619,
430, 843, 871, 105, 587, -235, -460, 1653,
778, -147, 1483, 1119, 644, 349, 329, -75,
787, 787, 787, 787, 787, 787, 787, 787,
787, 787, 787, 787, 787, 787, 787, 787,
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
-16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
287, 287, 287, 287, 287, 287, 287, 287,
202, 202, 202, 202, 202, 202, 202, 202,
10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358,
-11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164,
962, 962, 962, 962, -1202, -1202, -1202, -1202,
-1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468,
-28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800,
18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163,
-681, -681, 1017, 1017, 732, 732, 608, 608,
-1542, -1542, 411, 411, -205, -205, -1571, -1571,
19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249,
13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915,
-853, -90, -271, 830, 107, -1421, -247, -951,
-398, 961, -1508, -725, 448, -1065, 677, -1275,
-31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422,
817, 603, 1322, -1465, -1215, 1218, -874, -1187,
-1185, -1278, -1510, -870, -108, 996, 958, 1522,
20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
-21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,
1097, 610, -1285, 384, -136, -1335, 220, -1659,
-1530, 794, -854, 478, -308, 991, -1460, 1628,

#define _ZETAS_INV_EXP 528
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498,
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240,
1701, 1460, 2338, 308, 2851, 854, 2535, 1530,
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232,
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201,
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184,
1807, 2371, 2333, 108, 870, 1510, 1278, 1185,
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512,
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110,
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653,
1275, 2652, 1065, 2881, 725, 1508, 2368, 398,
951, 247, 1421, 3222, 2499, 271, 90, 853,
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110,
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073,
1571, 1571, 205, 205, 2918, 2918, 1542, 1542,
2721, 2721, 2597, 2597, 2312, 2312, 681, 681,
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202,
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847,
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474,
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367,
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695,
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346,
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127,
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042,
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437,
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406,
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685,
2210, 1846, 147, 2551, 1676, 460, 235, 2742,
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486,
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739,
45043, 32227, 11478, 335, 156, 2911, 872, 1590,
602, 777, 2170, 246, 1755, 291, 3152, 2907,
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402,
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565,
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618,
666, 320, 8, 2813, 1544, 282, 1838, 1293,
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098,
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361,
48173, 48173, 5828, 5828, 130, 130, 1602, 1602,
1871, 1871, 829, 829, 2946, 2946, 3065, 3065,
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691,
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779,
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147,
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707,
171, 171, 171, 171, 12403, 12403, 12403, 12403,
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012,
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907,
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836,
1836, 1836, 1836, 1836, 50791, 50791, 359, 359,
60300, 60300, 1932, 1932, 0, 0, 0, 0
//#define _16XSHIFT 624
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
}
};

+ 2
- 11
crypto_kem/kyber1024-90s/avx2/consts.h 查看文件

@@ -1,19 +1,10 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H
#define PQCLEAN_KYBER102490S_AVX2_CONSTS_H
#include "align.h"
#include "cdecl.h"
#include "params.h"
#include <immintrin.h>
#include <stdint.h>


#define ALIGNED_UINT16_T(N) \
union { \
__m256i as_vec; \
uint16_t as_arr[(N)]; \
}

typedef ALIGNED_UINT16_T(928) qdata_t;

typedef ALIGNED_INT16(640) qdata_t;
extern const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata;

#endif

+ 8
- 51
crypto_kem/kyber1024-90s/avx2/fq.S 查看文件

@@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9

red16 2,10
red16 3,11
red16 4,12
red16 5,13
red16 6,14
red16 7,15
red16 8,10
red16 9,11
red16 2
red16 3
red16 4
red16 5
red16 6
red16 7
red16 8
red16 9

#store
vmovdqa %ymm2,(%rdi)
@@ -46,49 +46,6 @@ add $256,%rdi
call reduce128_avx
ret

csubq128_avx:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm2
vmovdqa 64(%rdi),%ymm3
vmovdqa 96(%rdi),%ymm4
vmovdqa 128(%rdi),%ymm5
vmovdqa 160(%rdi),%ymm6
vmovdqa 192(%rdi),%ymm7
vmovdqa 224(%rdi),%ymm8

csubq 1,9
csubq 2,10
csubq 3,11
csubq 4,12
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,9

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm2,32(%rdi)
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm6,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm8,224(%rdi)

ret

.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx)
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
call csubq128_avx
add $256,%rdi
call csubq128_avx
ret

tomont128_avx:
#load
vmovdqa (%rdi),%ymm3


+ 5
- 4
crypto_kem/kyber1024-90s/avx2/fq.inc 查看文件

@@ -1,6 +1,10 @@
.macro red16 r,x=12
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
@@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
#vpcmpgtw %ymm0,%ymm\r,%ymm\x
#vpand %ymm0,%ymm\x,%ymm\x
#vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro caddq r,x=12


+ 120
- 110
crypto_kem/kyber1024-90s/avx2/indcpa.c 查看文件

@@ -8,6 +8,7 @@
#include "randombytes.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

@@ -15,11 +16,14 @@
* Name: pack_pk
*
* Description: Serialize the public key as concatenation of the
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
* serialized vector of polynomials pk and the
* public seed used to generate the matrix A.
* The polynomial coefficients in pk are assumed to
* lie in the interval [0,q], i.e. pk must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce().
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
@@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk,
/*************************************************
* Name: pack_sk
*
* Description: Serialize the secret key
* Description: Serialize the secret key.
* The polynomial coefficients in sk are assumed to
* lie in the interval [0,q], i.e. sk must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce().
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
@@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
/*************************************************
* Name: unpack_sk
*
* Description: De-serialize the secret key;
* inverse of pack_sk
* Description: De-serialize the secret key; inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials
* (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk);
}

@@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk,
*
* Description: Serialize the ciphertext as concatenation of the
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
* and the compressed and serialized polynomial v.
* The polynomial coefficients in b and v are assumed to
* lie in the interval [0,q], i.e. b and v must be reduced
* by PQCLEAN_KYBER102490S_AVX2_polyvec_reduce() and PQCLEAN_KYBER102490S_AVX2_poly_reduce(), respectively.
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) {
PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b);
PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c);
PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b,
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* Arguments: - int16_t *r: pointer to output array
* - unsigned int len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
@@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < len && pos + 3 <= buflen) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
pos += 3;

if (val < 19 * KYBER_Q) {
val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (ctr < len && val1 < KYBER_Q) {
r[ctr++] = val1;
}
}

@@ -165,12 +169,11 @@ static unsigned int rej_uniform(int16_t *r,
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr, i, j;
ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf;
void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) {
unsigned int ctr, i, j, k;
unsigned int buflen, off;
uint64_t nonce = 0;
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES) buf;
aes256ctr_ctx state;

PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0);
@@ -178,19 +181,24 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_K; j++) {
if (transposed) {
nonce.orig = (j << 8) | i;
nonce = (j << 8) | i;
} else {
nonce.orig = (i << 8) | j;
nonce = (i << 8) | j;
}

state.n = _mm_loadl_epi64(&nonce.vec);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state);
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);
buflen = REJ_UNIFORM_AVX_NBLOCKS * AES256CTR_BLOCKBYTES;
ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.coeffs);

while (ctr < KYBER_N) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr,
XOF_BLOCKBYTES);
off = buflen % 3;
for (k = 0; k < off; k++) {
buf.coeffs[k] = buf.coeffs[buflen - off + k];
}
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs + off, 1, &state);
buflen = off + AES256CTR_BLOCKBYTES;
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.coeffs, buflen);
}

PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]);
@@ -212,39 +220,41 @@ void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_S
void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
const uint8_t *publicseed = buf.arr;
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES;
uint8_t buf[2 * KYBER_SYMBYTES];
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf.arr, KYBER_SYMBYTES);
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES);
randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);

gen_a(a, publicseed);

ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */
uint64_t nonce = 0;
ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) coins; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1
aes256ctr_ctx state;
ALIGN32_ARRAY(uint8_t, 128) coins;
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce += 1;
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&skpv.vec[i], coins.vec);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.coeffs, NOISE_NBLOCKS, &state);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce += 1;
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&e.vec[i], coins.vec);
}

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv);
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&skpv);
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]);
}

@@ -261,70 +271,70 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYB
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins used as seed
* (of length KYBER_SYMBYTES) to deterministically
* generate all randomness
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i;
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
uint8_t seed[KYBER_SYMBYTES];
polyvec sp, pkpv, ep, at[KYBER_K], b;
poly v, k, epp;

unpack_pk(&pkpv, seed.arr, pk);
unpack_pk(&pkpv, seed, pk);
PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m);
gen_at(at, seed.arr);
gen_at(at, seed);

ALIGN16_TYPE(uint64_t) nonce = {.orig = 0};
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */
#define CIPHERTEXTNOISE_NBLOCKS ((KYBER_ETA2*KYBER_N/4)/AES256CTR_BLOCKBYTES) /* Assumes divisibility */
uint64_t nonce = 0;
ALIGNED_UINT8(NOISE_NBLOCKS * AES256CTR_BLOCKBYTES + 32) buf; /* +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1 */
aes256ctr_ctx state;
ALIGN32_ARRAY(uint8_t, 128) buf;
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce++);
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, NOISE_NBLOCKS, &state);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce += 1;
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(&sp.vec[i], buf.vec);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce += 1;
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&ep.vec[i], buf.vec);
}
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state);
state.n = _mm_loadl_epi64(&nonce.vec);
nonce.orig++;
PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr);
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.coeffs, CIPHERTEXTNOISE_NBLOCKS, &state);
state.n = _mm_loadl_epi64((__m128i *)&nonce);
nonce += 1;
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(&epp, buf.vec);

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
}
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&b);
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v);

PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER102490S_AVX2_polyvec_add(&b, &b, &ep);
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp);
PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &k);
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&bp);
PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(&b);
PQCLEAN_KYBER102490S_AVX2_poly_reduce(&v);

pack_ciphertext(c, &bp, &v);
pack_ciphertext(c, &b, &v);
}

/*************************************************
@@ -333,24 +343,24 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
polyvec b, skpv;
poly v, mp;

unpack_ciphertext(&bp, &v, c);
unpack_ciphertext(&b, &v, c);
unpack_sk(&skpv, sk);

PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp);
PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&b);
PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp);

PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp);


+ 122
- 154
crypto_kem/kyber1024-90s/avx2/invntt.S 查看文件

@@ -2,22 +2,21 @@
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2
#update & mul
vpsubw %ymm\rh0,%ymm\rl0,%ymm12
vpsubw %ymm\rh1,%ymm\rl1,%ymm13
vpsubw %ymm\rh2,%ymm\rl2,%ymm14

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
vpsubw %ymm\rl0,%ymm\rh0,%ymm12
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl1,%ymm\rh1,%ymm13

vpmullw %ymm\zl0,%ymm12,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl2,%ymm\rh2,%ymm14

vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpmullw %ymm\zl0,%ymm13,%ymm\rh1
vpsubw %ymm\rh3,%ymm\rl3,%ymm15
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw %ymm\rl3,%ymm\rh3,%ymm15

vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm14,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm15,%ymm\rh3

vpmulhw %ymm\zh0,%ymm12,%ymm12
@@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13
vpmulhw %ymm\zh1,%ymm14,%ymm14
vpmulhw %ymm\zh1,%ymm15,%ymm15

#reduce
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0

vpmulhw %ymm0,%ymm\rh1,%ymm\rh1

vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3

#

#

vpsubw %ymm\rh0,%ymm12,%ymm\rh0

vpsubw %ymm\rh1,%ymm13,%ymm\rh1

vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm

.text
invntt_levels0t5_avx:
level0:
#zetas
vmovdqu (%rsi),%ymm15
vmovdqu 64(%rsi),%ymm3
vmovdqu 32(%rsi),%ymm1
vmovdqu 96(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly 4,5,8,9,6,7,10,11,15,3,1,2

level1:
#zetas
vmovdqu 128(%rsi),%ymm3
vmovdqu 160(%rsi),%ymm2

butterfly 4,5,6,7,8,9,10,11,3,3,2,2
.macro intt_levels0t5 off
/* level 0 */
vmovdqa _16XFLO*2(%rsi),%ymm2
vmovdqa _16XFHI*2(%rsi),%ymm3

vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7

fqmulprecomp 2,3,4
fqmulprecomp 2,3,6
fqmulprecomp 2,3,5
fqmulprecomp 2,3,7

vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+112)*2(%rdi),%ymm11

fqmulprecomp 2,3,8
fqmulprecomp 2,3,10
fqmulprecomp 2,3,9
fqmulprecomp 2,3,11

vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm12
vpshufb %ymm12,%ymm15,%ymm15
vpshufb %ymm12,%ymm1,%ymm1
vpshufb %ymm12,%ymm2,%ymm2
vpshufb %ymm12,%ymm3,%ymm3

butterfly 4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm1
vpshufb %ymm1,%ymm2,%ymm2
vpshufb %ymm1,%ymm3,%ymm3

butterfly 4,5,6,7,8,9,10,11,2,2,3,3

shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11

level2:
#zetas
vmovdqu 192(%rsi),%ymm10
vmovdqu 224(%rsi),%ymm2

#consts
vmovdqa _16XV*2(%rdx),%ymm1
/* level 2 */
vmovdqa _REVIDXD*2(%rsi),%ymm12
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

butterfly 3,4,6,8,5,7,9,11,10,10,2,2
butterfly 3,4,6,8,5,7,9,11,2,2,10,10

vmovdqa _16XV*2(%rsi),%ymm1
red16 3

shuffle2 3,4,10,4
@@ -87,26 +110,22 @@ shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11

level3:
#zetas
vmovdqu 256(%rsi),%ymm9
vmovdqu 288(%rsi),%ymm2
/* level 3 */
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

butterfly 10,3,6,5,4,8,7,11,9,9,2,2

red16 10
butterfly 10,3,6,5,4,8,7,11,2,2,9,9

shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11

level4:
#zetas
vmovdqu 320(%rsi),%ymm7
vmovdqu 352(%rsi),%ymm2
/* level 4 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

butterfly 9,10,6,4,3,5,8,11,7,7,2,2
butterfly 9,10,6,4,3,5,8,11,2,2,7,7

red16 9

@@ -115,113 +134,62 @@ shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11

level5:
#zetas
vpbroadcastd 384(%rsi),%ymm8
vpbroadcastd 388(%rsi),%ymm2

butterfly 7,9,6,3,10,4,5,11,8,8,2,2

red16 7

#store
vmovdqa %ymm7,(%rdi)
vmovdqa %ymm9,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm3,96(%rdi)
vmovdqa %ymm10,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm5,192(%rdi)
vmovdqa %ymm11,224(%rdi)
/* level 5 */
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

ret
butterfly 7,9,6,3,10,4,5,11,2,2,8,8

invntt_level6_avx:
#zetas
vpbroadcastd (%rsi),%ymm1
vpbroadcastd 4(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm

butterfly 4,5,6,7,8,9,10,11
.macro intt_level6 off
/* level 6 */
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2

#consts
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)

#load
vmovdqa 128(%rdi),%ymm4
vmovdqa 160(%rdi),%ymm5
vmovdqa 192(%rdi),%ymm6
vmovdqa 224(%rdi),%ymm7
vmovdqa 384(%rdi),%ymm8
vmovdqa 416(%rdi),%ymm9
vmovdqa 448(%rdi),%ymm10
vmovdqa 480(%rdi),%ymm11
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3

butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,384(%rdi)
vmovdqa %ymm9,416(%rdi)
vmovdqa %ymm10,448(%rdi)
vmovdqa %ymm11,480(%rdi)

fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm5,160(%rdi)
vmovdqa %ymm6,192(%rdi)
vmovdqa %ymm7,224(%rdi)

ret
.if \off == 0
red16 4
.endif

vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm

.text
.global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
mov %rsi,%rdx
add $_ZETAS_INV_EXP*2,%rsi
call invntt_levels0t5_avx
add $256,%rdi
add $392,%rsi
call invntt_levels0t5_avx
sub $256,%rdi
add $392,%rsi
call invntt_level6_avx

intt_levels0t5 0
intt_levels0t5 1

intt_level6 0
intt_level6 1
ret

+ 36
- 36
crypto_kem/kyber1024-90s/avx2/kem.c 查看文件

@@ -1,4 +1,3 @@
#include "align.h"
#include "indcpa.h"
#include "kem.h"
#include "params.h"
@@ -15,13 +14,14 @@
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES],
unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -40,36 +40,36 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned cha
* secret for given public key
*
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES],
unsigned char ss[KYBER_SSBYTES],
const unsigned char pk[KYBER_PUBLICKEYBYTES]) {
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t kr[2 * KYBER_SYMBYTES];

randombytes(buf.arr, KYBER_SYMBYTES);
randombytes(buf, KYBER_SYMBYTES);
/* Don't release system RNG output */
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES);
hash_h(buf, buf, KYBER_SYMBYTES);

/* Multitarget countermeasure for coins + contributory KEM */
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES);
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES);

/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

@@ -80,47 +80,47 @@ int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct,
* cipher text and private key
*
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES],
const unsigned char ct[KYBER_CIPHERTEXTBYTES],
const unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
int fail;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
uint8_t kr[2 * KYBER_SYMBYTES];
ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp;
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;

PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk);
PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk);

/* Multitarget countermeasure for coins + contributory KEM */
for (i = 0; i < KYBER_SYMBYTES; i++) {
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
}
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES);
PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES);

fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES);

/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);

/* Overwrite pre-k with z on re-encryption failure */
PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail);
PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);

/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

+ 154
- 185
crypto_kem/kyber1024-90s/avx2/ntt.S 查看文件

@@ -1,222 +1,191 @@
#include "cdecl.h"
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1
#mul
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13

vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmullw %ymm\zl1,%ymm\rh3,%ymm15

vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1

vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3
.endm

#reduce
.macro reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13

vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
vpsubw %ymm12,%ymm\rh0,%ymm12
vpsubw %ymm13,%ymm\rh1,%ymm13
vpsubw %ymm14,%ymm\rh2,%ymm14
vpsubw %ymm15,%ymm\rh3,%ymm15

#update
vpsubw %ymm12,%ymm\rl0,%ymm\rh0
vpaddw %ymm12,%ymm\rl0,%ymm\rl0
vpsubw %ymm13,%ymm\rl1,%ymm\rh1
vpaddw %ymm13,%ymm\rl1,%ymm\rl1
vpsubw %ymm14,%ymm\rl2,%ymm\rh2
vpaddw %ymm14,%ymm\rl2,%ymm\rl2
vpsubw %ymm15,%ymm\rl3,%ymm\rh3
vpaddw %ymm15,%ymm\rl3,%ymm\rl3
.endm

# We break the dependency chains with the cost of slightly more additions.
# But they can be run in parallel to the multiplications on execution port 5
# (multiplications only go to ports 0 and 1)
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0

#reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2

vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3

#update
vpsubw %ymm12,%ymm\rln,%ymm\rln
vpaddw %ymm12,%ymm\rh0,%ymm\rh0
vpsubw %ymm12,%ymm\rl0,%ymm\rl0
vpsubw %ymm13,%ymm\rl0,%ymm\rl0

vpaddw %ymm13,%ymm\rh1,%ymm\rh1
vpsubw %ymm13,%ymm\rl1,%ymm\rl1
vpsubw %ymm14,%ymm\rl1,%ymm\rl1
vpaddw %ymm14,%ymm\rh2,%ymm\rh2
vpsubw %ymm14,%ymm\rl2,%ymm\rl2

vpsubw %ymm15,%ymm\rl2,%ymm\rl2
vpaddw %ymm15,%ymm\rh3,%ymm\rh3
vpsubw %ymm15,%ymm\rl3,%ymm\rl3
.endm

.text
ntt_level0_avx:
level0:
#zetas
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)
.macro level0 off
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2

mul 8,9,10,11

vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7

reduce
update 3,4,5,6,7,8,9,10,11

vmovdqa %ymm3,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm

ret
.macro levels1t6 off
/* level 1 */
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+112)*2(%rdi),%ymm11
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2

ntt_levels1t6_avx:
level1:
#zetas
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11,3

level2:
#zetas
vmovdqu 8(%rsi),%ymm15
vmovdqu 40(%rsi),%ymm1

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly2 3,8,4,9,5,10,6,11,7

level3:
#zetas
vmovdqu 72(%rsi),%ymm15
vmovdqu 104(%rsi),%ymm1

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly2 7,5,3,10,8,6,4,11,9

level4:
#zetas
vmovdqu 136(%rsi),%ymm15
vmovdqu 168(%rsi),%ymm1

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

butterfly2 9,8,7,6,5,4,3,11,10

level5:
#zetas
vmovdqu 200(%rsi),%ymm15
vmovdqu 232(%rsi),%ymm1

shuffle1 9,5,10,5
shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11

butterfly2 10,5,9,4,8,3,7,11,6

level6:
#zetas
vmovdqu 264(%rsi),%ymm14
vmovdqu 328(%rsi),%ymm15
vmovdqu 296(%rsi),%ymm1
vmovdqu 360(%rsi),%ymm2

butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2

vmovdqa _16XV*2(%rdx),%ymm1
red16 10,12
red16 5,13
red16 9,14
red16 4,15
red16 8,2
red16 3,6
red16 7,12
red16 11,13

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm9,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm3,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)
mul 8,9,10,11

ret
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7

reduce
update 3,4,5,6,7,8,9,10,11

/* level 2 */
shuffle8 5,10,7,10
shuffle8 6,11,5,11

vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2

mul 7,10,5,11

shuffle8 3,8,6,8
shuffle8 4,9,3,9

reduce
update 4,6,8,3,9,7,10,5,11

/* level 3 */
shuffle4 8,5,9,5
shuffle4 3,11,8,11

vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2

mul 9,5,8,11

shuffle4 4,7,3,7
shuffle4 6,10,4,10

reduce
update 6,3,7,4,10,9,5,8,11

/* level 4 */
shuffle2 7,8,10,8
shuffle2 4,11,7,11

vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2

mul 10,8,7,11

shuffle2 6,9,4,9
shuffle2 3,5,6,5

reduce
update 3,4,9,6,5,10,8,7,11

/* level 5 */
shuffle1 9,7,5,7
shuffle1 6,11,9,11

vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2

mul 5,7,9,11

shuffle1 3,10,6,10
shuffle1 4,8,3,8

reduce
update 4,6,10,3,8,5,7,9,11

/* level 6 */
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2

mul 10,3,9,11,14,15,8,2

reduce
update 8,4,6,5,7,10,3,9,11

vmovdqa %ymm8,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm

.text
.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx)
cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx):
_cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
mov %rsi,%rdx
add $_ZETAS_EXP*2,%rsi
call ntt_level0_avx
add $128,%rdi
call ntt_level0_avx
sub $128,%rdi
add $8,%rsi
call ntt_levels1t6_avx
add $256,%rdi
add $392,%rsi
call ntt_levels1t6_avx

level0 0
level0 1

levels1t6 0
levels1t6 1

ret

+ 12
- 15
crypto_kem/kyber1024-90s/avx2/ntt.h 查看文件

@@ -1,24 +1,21 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_NTT_H
#define PQCLEAN_KYBER102490S_AVX2_NTT_H
#include "consts.h"

#include <immintrin.h>
#include <stdint.h>

void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r,
const int16_t *a,
const int16_t *b,
const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r,
const int16_t *a,
const int16_t *b,
const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_basemul_avx(__m256i *r,
const __m256i *a,
const __m256i *b,
const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);

void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);

#endif

+ 3
- 2
crypto_kem/kyber1024-90s/avx2/params.h 查看文件

@@ -7,8 +7,6 @@
#define KYBER_N 256
#define KYBER_Q 3329

#define KYBER_ETA 2

#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

@@ -16,9 +14,12 @@
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_ETA1 2
#define KYBER_POLYCOMPRESSEDBYTES 160
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

#define KYBER_ETA2 2

#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)


+ 157
- 116
crypto_kem/kyber1024-90s/avx2/poly.c 查看文件

@@ -12,76 +12,99 @@
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_compress
*
* Description: Compression and subsequent serialization of a polynomial
* Description: Compression and subsequent serialization of a polynomial.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce().
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) {
unsigned int i, j;
uint8_t t[8];

PQCLEAN_KYBER102490S_AVX2_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) {
size_t i;
uint32_t low;
__m256i f0, f1;
__m128i t0, t1;
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]);
const __m256i shift1 = _mm256_set1_epi16(1 << 10);
const __m256i mask = _mm256_set1_epi16(31);
const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
const __m256i sllvdidx = _mm256_set1_epi64x(12);
const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9,
-1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0);

r[0] = (t[0] >> 0) | (t[1] << 5);
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
r[2] = (t[3] >> 1) | (t[4] << 4);
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
r[4] = (t[6] >> 2) | (t[7] << 3);
r += 5;
for (i = 0; i < KYBER_N / 32; i++) {
f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
f1 = _mm256_load_si256(&a->vec[2 * i + 1]);
f0 = _mm256_mulhi_epi16(f0, v);
f1 = _mm256_mulhi_epi16(f1, v);
f0 = _mm256_mulhrs_epi16(f0, shift1);
f1 = _mm256_mulhrs_epi16(f1, shift1);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_and_si256(f1, mask);
f0 = _mm256_packus_epi16(f0, f1);
f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3
f0 = _mm256_sllv_epi32(f0, sllvdidx);
f0 = _mm256_srlv_epi64(f0, sllvdidx);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
t0 = _mm256_castsi256_si128(f0);
t1 = _mm256_extracti128_si256(f0, 1);
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
_mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
_mm_store_ss((float *)&low, _mm_castsi128_ps(t1));
r[20 * i + 16] = (uint8_t)low;
r[20 * i + 17] = (uint8_t)(low >> 0x08);
r[20 * i + 18] = (uint8_t)(low >> 0x10);
r[20 * i + 19] = (uint8_t)(low >> 0x18);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r,
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) {
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) {
unsigned int i;
int16_t h;
__m128i t;
__m256i f;
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]);
const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5,
4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31,
248, 1984, 62, 496, 3968, 124, 992, 31);
const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024,
128, 16, 512, 64, 8, 256, 32, 1024);

unsigned int j;
uint8_t t[8];
for (i = 0; i < KYBER_N / 8; i++) {
t[0] = (a[0] >> 0);
t[1] = (a[0] >> 5) | (a[1] << 3);
t[2] = (a[1] >> 2);
t[3] = (a[1] >> 7) | (a[2] << 1);
t[4] = (a[2] >> 4) | (a[3] << 4);
t[5] = (a[3] >> 1);
t[6] = (a[3] >> 6) | (a[4] << 2);
t[7] = (a[4] >> 3);
a += 5;

for (j = 0; j < 8; j++) {
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5;
}
for (i = 0; i < KYBER_N / 16; i++) {
t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]);
h = (a[10 * i + 9] << 8) + a[10 * i + 8];
t = _mm_insert_epi16(t, h, 4);
f = _mm256_broadcastsi128_si256(t);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_and_si256(f, mask);
f = _mm256_mullo_epi16(f, shift);
f = _mm256_mulhrs_epi16(f, q);
_mm256_store_si256(&r->vec[i], f);
}
}


/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes
*
* Description: Serialization of a polynomial
* Description: Serialization of a polynomial in NTT representation.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce(). The coefficients are ordered as output by
* PQCLEAN_KYBER102490S_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed
* order.
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
@@ -90,12 +113,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a)
* Description: De-serialization of a polynomial;
* inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) {
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
@@ -103,11 +126,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r,
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3));
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
@@ -136,12 +158,12 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r,
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3)
_mm256_store_si256(&r->vec[0+2*(i)+0],g0); \
_mm256_store_si256(&r->vec[0+2*(i)+1],g1); \
_mm256_store_si256(&r->vec[8+2*(i)+0],g2); \
_mm256_store_si256(&r->vec[8+2*(i)+1],g3)

f = _mm256_load_si256((__m256i *)msg);
f = _mm256_loadu_si256((__m256i *)msg);
FROMMSG64(0);
FROMMSG64(1);
FROMMSG64(2);
@@ -151,32 +173,34 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r,
/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
* Description: Convert polynomial to 32-byte message.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER102490S_AVX2_poly_reduce().
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) {
unsigned int i;
uint32_t small;
__m256i f0, f1, g0, g1;
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);
const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4);

for (i = 0; i < KYBER_N / 32; i++) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]);
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]);
f0 = _mm256_sub_epi16(hqs, f0);
f1 = _mm256_sub_epi16(hqs, f1);
f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
f1 = _mm256_load_si256(&a->vec[2 * i + 1]);
f0 = _mm256_sub_epi16(hq, f0);
f1 = _mm256_sub_epi16(hq, f1);
g0 = _mm256_srai_epi16(f0, 15);
g1 = _mm256_srai_epi16(f1, 15);
f0 = _mm256_xor_si256(f0, g0);
f1 = _mm256_xor_si256(f1, g1);
f0 = _mm256_sub_epi16(hhqs, f0);
f1 = _mm256_sub_epi16(hhqs, f1);
f0 = _mm256_sub_epi16(f0, hhq);
f1 = _mm256_sub_epi16(f1, hhq);
f0 = _mm256_packs_epi16(f0, f1);
small = _mm256_movemask_epi8(f0);
small = ~small;
msg[4 * i + 0] = small;
msg[4 * i + 1] = small >> 16;
msg[4 * i + 2] = small >> 8;
@@ -185,21 +209,39 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], po
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA1
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1
prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta1(r, buf.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
* with parameter KYBER_ETA2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf;
prf(buf.arr, sizeof(buf.arr), seed, nonce);
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf.arr);
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf;
prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER102490S_AVX2_poly_cbd_eta2(r, buf.vec);
}


@@ -207,13 +249,17 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_S
* Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
* inputs assumed to be in normal order, output in bitreversed order
* a polynomial in place.
* Input coefficients assumed to be in normal order,
* output coefficients are in special order that is natural
* for the vectorization. Input coefficients are assumed to be
* bounded by q in absolute value, output coefficients are bounded
* by 16118 in absolute value.
*
* Arguments: - uint16_t *r: pointer to in/output polynomial
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) {
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
@@ -221,29 +267,35 @@ void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) {
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal order
* Input coefficients assumed to be in special order from vectorized
* forward ntt, output in normal order. Input coefficients can be
* arbitrary 16-bit integers, output coefficients are bounded by 14870
* in absolute value.
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) {
PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) {
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery
*
* Description: Multiplication of two polynomials in NTT domain
* Description: Multiplication of two polynomials in NTT domain.
* One of the input polynomials needs to have coefficients
* bounded by q, the other polynomial can have arbitrary
* coefficients. Output coefficients are bounded by 6656.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
@@ -255,7 +307,7 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) {
PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
@@ -267,28 +319,16 @@ void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) {
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) {
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) {
PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata);
PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER102490S_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_add
*
* Description: Add two polynomials
* Description: Add two polynomials. No modular reduction
* is performed.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -296,20 +336,21 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
unsigned int i;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
f1 = _mm256_load_si256(&b->vec[i]);
f0 = _mm256_add_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
_mm256_store_si256(&r->vec[i], f0);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_poly_sub
*
* Description: Subtract two polynomials
* Description: Subtract two polynomials. No modular reduction
* is performed.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -317,10 +358,10 @@ void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
unsigned int i;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
f1 = _mm256_load_si256(&b->vec[i]);
f0 = _mm256_sub_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
_mm256_store_si256(&r->vec[i], f0);
}
}

+ 8
- 11
crypto_kem/kyber1024-90s/avx2/poly.h 查看文件

@@ -1,19 +1,13 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H
#define PQCLEAN_KYBER102490S_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/*
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
*/
typedef union {
__m256i dummy;
int16_t coeffs[KYBER_N];
} poly;
typedef ALIGNED_INT16(KYBER_N) poly;

void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);
void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);

void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);
@@ -22,7 +16,11 @@ void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POL
void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);

void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER102490S_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);



void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r);
void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r);
@@ -31,7 +29,6 @@ void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, c
void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r);
void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r);

void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b);


+ 93
- 72
crypto_kem/kyber1024-90s/avx2/polyvec.c 查看文件

@@ -3,8 +3,79 @@
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <immintrin.h>
#include <stdint.h>

/*
 * poly_compress11: AVX2 compression of one polynomial to 11 bits per
 * coefficient, serialized into r.
 *
 * Arguments: - uint8_t r[352 + 2]: output byte array. Each loop iteration
 *              advances the output offset by 22 bytes (16 coefficients *
 *              11 bits) but issues a 16-byte store followed by an 8-byte
 *              store, i.e. it touches 24 bytes. The final iteration thus
 *              writes 2 bytes past the 352-byte payload, which is
 *              presumably why the buffer is declared with 2 bytes of slack.
 *            - const poly *a: input polynomial, read through its __m256i
 *              view (a->vec) with aligned loads.
 *
 * NOTE(review): this replaces a scalar loop computing
 * ((x << 11) + KYBER_Q/2) / KYBER_Q & 0x7ff per coefficient; the multiply /
 * correction sequence below is presumably the vectorized equivalent of that
 * rounding -- confirm against the reference implementation.
 */
static void poly_compress11(uint8_t r[352 + 2], const poly *restrict a) {
unsigned int i;
__m256i f0, f1, f2;
__m128i t0, t1;
/* v comes from the shared constant table (entry _16XV); v8 is v shifted left by 3 in every 16-bit lane. */
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XV / 16]);
const __m256i v8 = _mm256_slli_epi16(v, 3);
const __m256i off = _mm256_set1_epi16(36);
const __m256i shift1 = _mm256_set1_epi16(1 << 13);
/* 2047 = 2^11 - 1: keeps the low 11 bits of each 16-bit lane. */
const __m256i mask = _mm256_set1_epi16(2047);
/* As 16-bit lanes this is (1, 2048, 1, 2048, ...): used with madd to form lo + 2^11 * hi for each lane pair. */
const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
const __m256i sllvdidx = _mm256_set1_epi64x(10);
const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5,
-1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

/* One iteration handles one __m256i, i.e. 16 coefficients. */
for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
/* Scaling and rounding-correction step (see NOTE(review) in the header). */
f1 = _mm256_mullo_epi16(f0, v8);
f2 = _mm256_add_epi16(f0, off);
f0 = _mm256_slli_epi16(f0, 3);
f0 = _mm256_mulhi_epi16(f0, v);
f2 = _mm256_sub_epi16(f1, f2);
f1 = _mm256_andnot_si256(f1, f2);
f1 = _mm256_srli_epi16(f1, 15);
f0 = _mm256_sub_epi16(f0, f1);
f0 = _mm256_mulhrs_epi16(f0, shift1);
/* Reduce every lane to 11 bits. */
f0 = _mm256_and_si256(f0, mask);
/* Merge adjacent 11-bit lanes into one 22-bit value per 32-bit lane. */
f0 = _mm256_madd_epi16(f0, shift2);
/* Shift/shuffle sequence that packs the 22-bit values into contiguous bytes. */
f0 = _mm256_sllv_epi32(f0, sllvdidx);
f1 = _mm256_bsrli_epi128(f0, 8);
f0 = _mm256_srlv_epi64(f0, srlvqidx);
f1 = _mm256_slli_epi64(f1, 34);
f0 = _mm256_add_epi64(f0, f1);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
/* Split into 128-bit halves; the blend selects bytes from t1 wherever the low half of shufbidx has its sign bit set (the -1 entries). */
t0 = _mm256_castsi256_si128(f0);
t1 = _mm256_extracti128_si256(f0, 1);
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
/* Unaligned 16-byte store plus 8-byte store: 24 bytes written, offset advances by 22. */
_mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
_mm_storel_epi64((__m128i *)&r[22 * i + 16], t1);
}
}

/*
 * poly_decompress11: AVX2 decompression of one polynomial from a byte
 * stream that advances 22 bytes per 16 coefficients (11 bits each).
 *
 * Arguments: - poly *r: output polynomial, written through its __m256i view
 *              (r->vec) with aligned stores.
 *            - const uint8_t a[352 + 10]: input byte array. Every iteration
 *              performs an unaligned 32-byte load at offset 22*i although
 *              only 22 bytes are consumed, so the last iteration reads 10
 *              bytes beyond the 352-byte payload; the caller must provide
 *              that many readable bytes (hence the "+ 10").
 *
 * NOTE(review): this replaces a scalar loop computing
 * ((t & 0x7FF) * KYBER_Q + 1024) >> 11 per coefficient; the final
 * mullo / srli / and / mulhrs sequence is presumably the vectorized
 * equivalent -- confirm against the reference implementation.
 */
static void poly_decompress11(poly *restrict r, const uint8_t a[352 + 10]) {
unsigned int i;
__m256i f;
/* q is read from the shared constant table (entry _16XQ). */
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]);
/* Per-lane byte gather: low 128-bit lane uses bytes 0..10, high lane uses bytes 3..13 of its own 16 bytes. */
const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8,
8, 7, 6, 5, 5, 4, 4, 3,
10, 9, 9, 8, 7, 6, 6, 5,
5, 4, 3, 2, 2, 1, 1, 0);
const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32);
/* 32752 = 0x7FF0: keeps an 11-bit field located at bits 4..14 of each 16-bit lane. */
const __m256i mask = _mm256_set1_epi16(32752);

/* One iteration produces one __m256i, i.e. 16 coefficients, from 22 input bytes. */
for (i = 0; i < KYBER_N / 16; i++) {
f = _mm256_loadu_si256((__m256i *)&a[22 * i]);
/* 0x94 selects 64-bit words (0,1,1,2): the upper 128-bit lane now holds bytes 8..23 of the load, */
/* so together with shufbidx the upper lane draws on input bytes 11..21. */
f = _mm256_permute4x64_epi64(f, 0x94);
f = _mm256_shuffle_epi8(f, shufbidx);
/* Variable shifts plus per-lane multiply move each 11-bit field to a common bit position. */
f = _mm256_srlv_epi32(f, srlvdidx);
f = _mm256_srlv_epi64(f, srlvqidx);
f = _mm256_mullo_epi16(f, shift);
f = _mm256_srli_epi16(f, 1);
f = _mm256_and_si256(f, mask);
/* Rounded high multiply by q (see NOTE(review) in the header). */
f = _mm256_mulhrs_epi16(f, q);
_mm256_store_si256(&r->vec[i], f);
}
}


/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress
*
@@ -14,33 +85,11 @@
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES],
polyvec *restrict a) {
size_t i, j, k;

PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) {
size_t i;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
}

r[ 0] = (t[0] >> 0);
r[ 1] = (t[0] >> 8) | (t[1] << 3);
r[ 2] = (t[1] >> 5) | (t[2] << 6);
r[ 3] = (t[2] >> 2);
r[ 4] = (t[2] >> 10) | (t[3] << 1);
r[ 5] = (t[3] >> 7) | (t[4] << 4);
r[ 6] = (t[4] >> 4) | (t[5] << 7);
r[ 7] = (t[5] >> 1);
r[ 8] = (t[5] >> 9) | (t[6] << 2);
r[ 9] = (t[6] >> 6) | (t[7] << 5);
r[10] = (t[7] >> 3);
r += 11;
}
poly_compress11(&r[352 * i], &a->vec[i]);
}
}

@@ -50,31 +99,15 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSE
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
size_t i, j, k;
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) {
size_t i;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
a += 11;

for (k = 0; k < 8; k++) {
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
}
}
poly_decompress11(&r->vec[i], &a[352 * i]);
}
}

@@ -100,7 +133,7 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], po
* Description: De-serialize vector of polynomials;
* inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes
*
* Arguments: - uint8_t *r: pointer to output byte array
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECBYTES)
**************************************************/
@@ -141,29 +174,34 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) {
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery
*
* Description: Pointwise multiply elements of a and b, accumulate into r,
* Description: Multiply elements in a and b in NTT domain, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) {
size_t i;
poly tmp;

PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
for (i = 1; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]);
PQCLEAN_KYBER102490S_AVX2_poly_add(r, r, &tmp);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* of each element of a vector of polynomials;
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) {
size_t i;
@@ -172,23 +210,6 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) {
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) {
size_t i;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add
*


+ 3
- 7
crypto_kem/kyber1024-90s/avx2/polyvec.h 查看文件

@@ -8,9 +8,8 @@ typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]);

void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);
void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
@@ -18,12 +17,9 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYB
void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r);

void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);
void PQCLEAN_KYBER102490S_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);

void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r);
void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r);

void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);



+ 4
- 5
crypto_kem/kyber1024-90s/avx2/reduce.h 查看文件

@@ -1,10 +1,9 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_REDUCE_H
#define PQCLEAN_KYBER102490S_AVX2_REDUCE_H
#include "consts.h"
#include <stdint.h>
#include "params.h"
#include <immintrin.h>

int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);
void PQCLEAN_KYBER102490S_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER102490S_AVX2_qdata);

#endif

+ 72
- 309
crypto_kem/kyber1024-90s/avx2/rejsample.c 查看文件

@@ -4,311 +4,68 @@
#include "rejsample.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

//#define BMI

static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = {
{-1, -1, -1, -1, -1, -1, -1, -1},
{ 0, -1, -1, -1, -1, -1, -1, -1},
{ 2, -1, -1, -1, -1, -1, -1, -1},
{ 0, 2, -1, -1, -1, -1, -1, -1},
{ 4, -1, -1, -1, -1, -1, -1, -1},
{ 0, 4, -1, -1, -1, -1, -1, -1},
{ 2, 4, -1, -1, -1, -1, -1, -1},
{ 0, 2, 4, -1, -1, -1, -1, -1},
{ 6, -1, -1, -1, -1, -1, -1, -1},
{ 0, 6, -1, -1, -1, -1, -1, -1},
{ 2, 6, -1, -1, -1, -1, -1, -1},
{ 0, 2, 6, -1, -1, -1, -1, -1},
{ 4, 6, -1, -1, -1, -1, -1, -1},
{ 0, 4, 6, -1, -1, -1, -1, -1},
{ 2, 4, 6, -1, -1, -1, -1, -1},
{ 0, 2, 4, 6, -1, -1, -1, -1},
{ 8, -1, -1, -1, -1, -1, -1, -1},
{ 0, 8, -1, -1, -1, -1, -1, -1},
{ 2, 8, -1, -1, -1, -1, -1, -1},
{ 0, 2, 8, -1, -1, -1, -1, -1},
{ 4, 8, -1, -1, -1, -1, -1, -1},
{ 0, 4, 8, -1, -1, -1, -1, -1},
{ 2, 4, 8, -1, -1, -1, -1, -1},
{ 0, 2, 4, 8, -1, -1, -1, -1},
{ 6, 8, -1, -1, -1, -1, -1, -1},
{ 0, 6, 8, -1, -1, -1, -1, -1},
{ 2, 6, 8, -1, -1, -1, -1, -1},
{ 0, 2, 6, 8, -1, -1, -1, -1},
{ 4, 6, 8, -1, -1, -1, -1, -1},
{ 0, 4, 6, 8, -1, -1, -1, -1},
{ 2, 4, 6, 8, -1, -1, -1, -1},
{ 0, 2, 4, 6, 8, -1, -1, -1},
{10, -1, -1, -1, -1, -1, -1, -1},
{ 0, 10, -1, -1, -1, -1, -1, -1},
{ 2, 10, -1, -1, -1, -1, -1, -1},
{ 0, 2, 10, -1, -1, -1, -1, -1},
{ 4, 10, -1, -1, -1, -1, -1, -1},
{ 0, 4, 10, -1, -1, -1, -1, -1},
{ 2, 4, 10, -1, -1, -1, -1, -1},
{ 0, 2, 4, 10, -1, -1, -1, -1},
{ 6, 10, -1, -1, -1, -1, -1, -1},
{ 0, 6, 10, -1, -1, -1, -1, -1},
{ 2, 6, 10, -1, -1, -1, -1, -1},
{ 0, 2, 6, 10, -1, -1, -1, -1},
{ 4, 6, 10, -1, -1, -1, -1, -1},
{ 0, 4, 6, 10, -1, -1, -1, -1},
{ 2, 4, 6, 10, -1, -1, -1, -1},
{ 0, 2, 4, 6, 10, -1, -1, -1},
{ 8, 10, -1, -1, -1, -1, -1, -1},
{ 0, 8, 10, -1, -1, -1, -1, -1},
{ 2, 8, 10, -1, -1, -1, -1, -1},
{ 0, 2, 8, 10, -1, -1, -1, -1},
{ 4, 8, 10, -1, -1, -1, -1, -1},
{ 0, 4, 8, 10, -1, -1, -1, -1},
{ 2, 4, 8, 10, -1, -1, -1, -1},
{ 0, 2, 4, 8, 10, -1, -1, -1},
{ 6, 8, 10, -1, -1, -1, -1, -1},
{ 0, 6, 8, 10, -1, -1, -1, -1},
{ 2, 6, 8, 10, -1, -1, -1, -1},
{ 0, 2, 6, 8, 10, -1, -1, -1},
{ 4, 6, 8, 10, -1, -1, -1, -1},
{ 0, 4, 6, 8, 10, -1, -1, -1},
{ 2, 4, 6, 8, 10, -1, -1, -1},
{ 0, 2, 4, 6, 8, 10, -1, -1},
{12, -1, -1, -1, -1, -1, -1, -1},
{ 0, 12, -1, -1, -1, -1, -1, -1},
{ 2, 12, -1, -1, -1, -1, -1, -1},
{ 0, 2, 12, -1, -1, -1, -1, -1},
{ 4, 12, -1, -1, -1, -1, -1, -1},
{ 0, 4, 12, -1, -1, -1, -1, -1},
{ 2, 4, 12, -1, -1, -1, -1, -1},
{ 0, 2, 4, 12, -1, -1, -1, -1},
{ 6, 12, -1, -1, -1, -1, -1, -1},
{ 0, 6, 12, -1, -1, -1, -1, -1},
{ 2, 6, 12, -1, -1, -1, -1, -1},
{ 0, 2, 6, 12, -1, -1, -1, -1},
{ 4, 6, 12, -1, -1, -1, -1, -1},
{ 0, 4, 6, 12, -1, -1, -1, -1},
{ 2, 4, 6, 12, -1, -1, -1, -1},
{ 0, 2, 4, 6, 12, -1, -1, -1},
{ 8, 12, -1, -1, -1, -1, -1, -1},
{ 0, 8, 12, -1, -1, -1, -1, -1},
{ 2, 8, 12, -1, -1, -1, -1, -1},
{ 0, 2, 8, 12, -1, -1, -1, -1},
{ 4, 8, 12, -1, -1, -1, -1, -1},
{ 0, 4, 8, 12, -1, -1, -1, -1},
{ 2, 4, 8, 12, -1, -1, -1, -1},
{ 0, 2, 4, 8, 12, -1, -1, -1},
{ 6, 8, 12, -1, -1, -1, -1, -1},
{ 0, 6, 8, 12, -1, -1, -1, -1},
{ 2, 6, 8, 12, -1, -1, -1, -1},
{ 0, 2, 6, 8, 12, -1, -1, -1},
{ 4, 6, 8, 12, -1, -1, -1, -1},
{ 0, 4, 6, 8, 12, -1, -1, -1},
{ 2, 4, 6, 8, 12, -1, -1, -1},
{ 0, 2, 4, 6, 8, 12, -1, -1},
{10, 12, -1, -1, -1, -1, -1, -1},
{ 0, 10, 12, -1, -1, -1, -1, -1},
{ 2, 10, 12, -1, -1, -1, -1, -1},
{ 0, 2, 10, 12, -1, -1, -1, -1},
{ 4, 10, 12, -1, -1, -1, -1, -1},
{ 0, 4, 10, 12, -1, -1, -1, -1},
{ 2, 4, 10, 12, -1, -1, -1, -1},
{ 0, 2, 4, 10, 12, -1, -1, -1},
{ 6, 10, 12, -1, -1, -1, -1, -1},
{ 0, 6, 10, 12, -1, -1, -1, -1},
{ 2, 6, 10, 12, -1, -1, -1, -1},
{ 0, 2, 6, 10, 12, -1, -1, -1},
{ 4, 6, 10, 12, -1, -1, -1, -1},
{ 0, 4, 6, 10, 12, -1, -1, -1},
{ 2, 4, 6, 10, 12, -1, -1, -1},
{ 0, 2, 4, 6, 10, 12, -1, -1},
{ 8, 10, 12, -1, -1, -1, -1, -1},
{ 0, 8, 10, 12, -1, -1, -1, -1},
{ 2, 8, 10, 12, -1, -1, -1, -1},
{ 0, 2, 8, 10, 12, -1, -1, -1},
{ 4, 8, 10, 12, -1, -1, -1, -1},
{ 0, 4, 8, 10, 12, -1, -1, -1},
{ 2, 4, 8, 10, 12, -1, -1, -1},
{ 0, 2, 4, 8, 10, 12, -1, -1},
{ 6, 8, 10, 12, -1, -1, -1, -1},
{ 0, 6, 8, 10, 12, -1, -1, -1},
{ 2, 6, 8, 10, 12, -1, -1, -1},
{ 0, 2, 6, 8, 10, 12, -1, -1},
{ 4, 6, 8, 10, 12, -1, -1, -1},
{ 0, 4, 6, 8, 10, 12, -1, -1},
{ 2, 4, 6, 8, 10, 12, -1, -1},
{ 0, 2, 4, 6, 8, 10, 12, -1},
{14, -1, -1, -1, -1, -1, -1, -1},
{ 0, 14, -1, -1, -1, -1, -1, -1},
{ 2, 14, -1, -1, -1, -1, -1, -1},
{ 0, 2, 14, -1, -1, -1, -1, -1},
{ 4, 14, -1, -1, -1, -1, -1, -1},
{ 0, 4, 14, -1, -1, -1, -1, -1},
{ 2, 4, 14, -1, -1, -1, -1, -1},
{ 0, 2, 4, 14, -1, -1, -1, -1},
{ 6, 14, -1, -1, -1, -1, -1, -1},
{ 0, 6, 14, -1, -1, -1, -1, -1},
{ 2, 6, 14, -1, -1, -1, -1, -1},
{ 0, 2, 6, 14, -1, -1, -1, -1},
{ 4, 6, 14, -1, -1, -1, -1, -1},
{ 0, 4, 6, 14, -1, -1, -1, -1},
{ 2, 4, 6, 14, -1, -1, -1, -1},
{ 0, 2, 4, 6, 14, -1, -1, -1},
{ 8, 14, -1, -1, -1, -1, -1, -1},
{ 0, 8, 14, -1, -1, -1, -1, -1},
{ 2, 8, 14, -1, -1, -1, -1, -1},
{ 0, 2, 8, 14, -1, -1, -1, -1},
{ 4, 8, 14, -1, -1, -1, -1, -1},
{ 0, 4, 8, 14, -1, -1, -1, -1},
{ 2, 4, 8, 14, -1, -1, -1, -1},
{ 0, 2, 4, 8, 14, -1, -1, -1},
{ 6, 8, 14, -1, -1, -1, -1, -1},
{ 0, 6, 8, 14, -1, -1, -1, -1},
{ 2, 6, 8, 14, -1, -1, -1, -1},
{ 0, 2, 6, 8, 14, -1, -1, -1},
{ 4, 6, 8, 14, -1, -1, -1, -1},
{ 0, 4, 6, 8, 14, -1, -1, -1},
{ 2, 4, 6, 8, 14, -1, -1, -1},
{ 0, 2, 4, 6, 8, 14, -1, -1},
{10, 14, -1, -1, -1, -1, -1, -1},
{ 0, 10, 14, -1, -1, -1, -1, -1},
{ 2, 10, 14, -1, -1, -1, -1, -1},
{ 0, 2, 10, 14, -1, -1, -1, -1},
{ 4, 10, 14, -1, -1, -1, -1, -1},
{ 0, 4, 10, 14, -1, -1, -1, -1},
{ 2, 4, 10, 14, -1, -1, -1, -1},
{ 0, 2, 4, 10, 14, -1, -1, -1},
{ 6, 10, 14, -1, -1, -1, -1, -1},
{ 0, 6, 10, 14, -1, -1, -1, -1},
{ 2, 6, 10, 14, -1, -1, -1, -1},
{ 0, 2, 6, 10, 14, -1, -1, -1},
{ 4, 6, 10, 14, -1, -1, -1, -1},
{ 0, 4, 6, 10, 14, -1, -1, -1},
{ 2, 4, 6, 10, 14, -1, -1, -1},
{ 0, 2, 4, 6, 10, 14, -1, -1},
{ 8, 10, 14, -1, -1, -1, -1, -1},
{ 0, 8, 10, 14, -1, -1, -1, -1},
{ 2, 8, 10, 14, -1, -1, -1, -1},
{ 0, 2, 8, 10, 14, -1, -1, -1},
{ 4, 8, 10, 14, -1, -1, -1, -1},
{ 0, 4, 8, 10, 14, -1, -1, -1},
{ 2, 4, 8, 10, 14, -1, -1, -1},
{ 0, 2, 4, 8, 10, 14, -1, -1},
{ 6, 8, 10, 14, -1, -1, -1, -1},
{ 0, 6, 8, 10, 14, -1, -1, -1},
{ 2, 6, 8, 10, 14, -1, -1, -1},
{ 0, 2, 6, 8, 10, 14, -1, -1},
{ 4, 6, 8, 10, 14, -1, -1, -1},
{ 0, 4, 6, 8, 10, 14, -1, -1},
{ 2, 4, 6, 8, 10, 14, -1, -1},
{ 0, 2, 4, 6, 8, 10, 14, -1},
{12, 14, -1, -1, -1, -1, -1, -1},
{ 0, 12, 14, -1, -1, -1, -1, -1},
{ 2, 12, 14, -1, -1, -1, -1, -1},
{ 0, 2, 12, 14, -1, -1, -1, -1},
{ 4, 12, 14, -1, -1, -1, -1, -1},
{ 0, 4, 12, 14, -1, -1, -1, -1},
{ 2, 4, 12, 14, -1, -1, -1, -1},
{ 0, 2, 4, 12, 14, -1, -1, -1},
{ 6, 12, 14, -1, -1, -1, -1, -1},
{ 0, 6, 12, 14, -1, -1, -1, -1},
{ 2, 6, 12, 14, -1, -1, -1, -1},
{ 0, 2, 6, 12, 14, -1, -1, -1},
{ 4, 6, 12, 14, -1, -1, -1, -1},
{ 0, 4, 6, 12, 14, -1, -1, -1},
{ 2, 4, 6, 12, 14, -1, -1, -1},
{ 0, 2, 4, 6, 12, 14, -1, -1},
{ 8, 12, 14, -1, -1, -1, -1, -1},
{ 0, 8, 12, 14, -1, -1, -1, -1},
{ 2, 8, 12, 14, -1, -1, -1, -1},
{ 0, 2, 8, 12, 14, -1, -1, -1},
{ 4, 8, 12, 14, -1, -1, -1, -1},
{ 0, 4, 8, 12, 14, -1, -1, -1},
{ 2, 4, 8, 12, 14, -1, -1, -1},
{ 0, 2, 4, 8, 12, 14, -1, -1},
{ 6, 8, 12, 14, -1, -1, -1, -1},
{ 0, 6, 8, 12, 14, -1, -1, -1},
{ 2, 6, 8, 12, 14, -1, -1, -1},
{ 0, 2, 6, 8, 12, 14, -1, -1},
{ 4, 6, 8, 12, 14, -1, -1, -1},
{ 0, 4, 6, 8, 12, 14, -1, -1},
{ 2, 4, 6, 8, 12, 14, -1, -1},
{ 0, 2, 4, 6, 8, 12, 14, -1},
{10, 12, 14, -1, -1, -1, -1, -1},
{ 0, 10, 12, 14, -1, -1, -1, -1},
{ 2, 10, 12, 14, -1, -1, -1, -1},
{ 0, 2, 10, 12, 14, -1, -1, -1},
{ 4, 10, 12, 14, -1, -1, -1, -1},
{ 0, 4, 10, 12, 14, -1, -1, -1},
{ 2, 4, 10, 12, 14, -1, -1, -1},
{ 0, 2, 4, 10, 12, 14, -1, -1},
{ 6, 10, 12, 14, -1, -1, -1, -1},
{ 0, 6, 10, 12, 14, -1, -1, -1},
{ 2, 6, 10, 12, 14, -1, -1, -1},
{ 0, 2, 6, 10, 12, 14, -1, -1},
{ 4, 6, 10, 12, 14, -1, -1, -1},
{ 0, 4, 6, 10, 12, 14, -1, -1},
{ 2, 4, 6, 10, 12, 14, -1, -1},
{ 0, 2, 4, 6, 10, 12, 14, -1},
{ 8, 10, 12, 14, -1, -1, -1, -1},
{ 0, 8, 10, 12, 14, -1, -1, -1},
{ 2, 8, 10, 12, 14, -1, -1, -1},
{ 0, 2, 8, 10, 12, 14, -1, -1},
{ 4, 8, 10, 12, 14, -1, -1, -1},
{ 0, 4, 8, 10, 12, 14, -1, -1},
{ 2, 4, 8, 10, 12, 14, -1, -1},
{ 0, 2, 4, 8, 10, 12, 14, -1},
{ 6, 8, 10, 12, 14, -1, -1, -1},
{ 0, 6, 8, 10, 12, 14, -1, -1},
{ 2, 6, 8, 10, 12, 14, -1, -1},
{ 0, 2, 6, 8, 10, 12, 14, -1},
{ 4, 6, 8, 10, 12, 14, -1, -1},
{ 0, 4, 6, 8, 10, 12, 14, -1},
{ 2, 4, 6, 8, 10, 12, 14, -1},
{ 0, 2, 4, 6, 8, 10, 12, 14}
}
};

#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)

#define REJ_UNIFORM_BUFLEN 576
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r,
const uint8_t *restrict buf) {
unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;
uint32_t good;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1));
uint64_t idx0, idx1, idx2, idx3;
const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_qdata.vec[_16XQ / 16]);
const __m256i ones = _mm256_set1_epi8(1);
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]);
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]);
const __m256i mask = _mm256_set1_epi16(0xFFF);
const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10,
9, 8, 8, 7, 6, 5, 5, 4,
11, 10, 10, 9, 8, 7, 7, 6,
5, 4, 4, 3, 2, 1, 1, 0);
__m256i f0, f1, g0, g1, g2, g3;
__m128i f, t, pilo, pihi;

ctr = 0;
for (pos = 0; pos < 2 * KYBER_N; pos += 64) {
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]);
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]);

g0 = _mm256_cmpge_epu16(bound, f0);
g1 = _mm256_cmpge_epu16(bound, f1);
ctr = pos = 0;
while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) {
f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]);
f0 = _mm256_permute4x64_epi64(f0, 0x94);
f1 = _mm256_permute4x64_epi64(f1, 0x94);
f0 = _mm256_shuffle_epi8(f0, idx8);
f1 = _mm256_shuffle_epi8(f1, idx8);
g0 = _mm256_srli_epi16(f0, 4);
g1 = _mm256_srli_epi16(f1, 4);
f0 = _mm256_blend_epi16(f0, g0, 0xAA);
f1 = _mm256_blend_epi16(f1, g1, 0xAA);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_and_si256(f1, mask);
pos += 48;

g0 = _mm256_cmpgt_epi16(bound, f0);
g1 = _mm256_cmpgt_epi16(bound, f1);

g0 = _mm256_packs_epi16(g0, g1);
good = _mm256_movemask_epi8(g0);

g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF]));
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF]));
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1);
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1);

//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good));
//g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8);
/* Barrett reduction of (still unsigned) values */
g2 = _mm256_mulhi_epu16(f0, v);
g3 = _mm256_mulhi_epu16(f1, v);
g2 = _mm256_srli_epi16(g2, 10);
g3 = _mm256_srli_epi16(g3, 10);
g2 = _mm256_mullo_epi16(g2, kyberq);
g3 = _mm256_mullo_epi16(g3, kyberq);
f0 = _mm256_sub_epi16(f0, g2);
f1 = _mm256_sub_epi16(f1, g3);
idx0 = _pdep_u64(good >> 0, 0x0101010101010101);
idx1 = _pdep_u64(good >> 8, 0x0101010101010101);
idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
idx0 = (idx0 << 8) - idx0;
idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
idx1 = (idx1 << 8) - idx1;
idx1 = _pext_u64(0x0E0C0A0806040200, idx1);
idx2 = (idx2 << 8) - idx2;
idx2 = _pext_u64(0x0E0C0A0806040200, idx2);
idx3 = (idx3 << 8) - idx3;
idx3 = _pext_u64(0x0E0C0A0806040200, idx3);

g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);

g2 = _mm256_add_epi8(g0, ones);
g3 = _mm256_add_epi8(g1, ones);
@@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r,
ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
}

while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) {
f = _mm_load_si128((__m128i *)&buf[pos]);
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f);
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) {
f = _mm_loadu_si128((__m128i *)&buf[pos]);
f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
t = _mm_srli_epi16(f, 4);
f = _mm_blend_epi16(f, t, 0xAA);
f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
pos += 12;

t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
good = _mm_movemask_epi8(t);
good = _pext_u32(good, 0x5555);
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]);
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);

/* Barrett reduction */
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v));
t = _mm_srli_epi16(t, 10);
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq));
f = _mm_sub_epi16(f, t);
good &= 0x5555;
idx0 = _pdep_u64(good, 0x1111111111111111);
idx0 = (idx0 << 8) - idx0;
idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
pilo = _mm_cvtsi64_si128(idx0);

pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);
f = _mm_shuffle_epi8(f, pilo);
_mm_storeu_si128((__m128i *)&r[ctr], f);
ctr += _mm_popcnt_u32(good);
pos += 16;
}

while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4));
pos += 3;

if (val < 19 * KYBER_Q) {
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q;
r[ctr++] = val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (val1 < KYBER_Q && ctr < KYBER_N) {
r[ctr++] = val1;
}
}



+ 5
- 2
crypto_kem/kyber1024-90s/avx2/rejsample.h 查看文件

@@ -1,9 +1,12 @@
#ifndef PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H
#define PQCLEAN_KYBER102490S_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r,
const unsigned char *buf);
#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)

unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf);

#endif

+ 6
- 6
crypto_kem/kyber1024-90s/avx2/shuffle.S 查看文件

@@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12

#csubq
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,1
csubq 6,13
csubq 7,13
csubq 8,13
csubq 9,13
csubq 10,14
csubq 11,15
csubq 12,1
csubq 10,13
csubq 11,13
csubq 12,13

#bitpack
vpsllw $12,%ymm6,%ymm4


+ 10
- 10
crypto_kem/kyber1024-90s/avx2/shuffle.inc 查看文件

@@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm12
vpsrld $16,%ymm\r0,%ymm13
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 4
- 6
crypto_kem/kyber1024-90s/avx2/symmetric.h 查看文件

@@ -14,12 +14,10 @@ typedef aes256ctr_ctx xof_state;

#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES)
#define xof_absorb(STATE, SEED, X, Y) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8))
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) \
PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8))
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)




+ 33
- 35
crypto_kem/kyber1024-90s/avx2/verify.c 查看文件

@@ -8,31 +8,31 @@
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const unsigned char *a: pointer to first byte array
* const unsigned char *b: pointer to second byte array
* size_t len: length of the byte arrays
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, 1 otherwise
**************************************************/
int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos;
size_t i;
uint64_t r;
__m256i avec, bvec, cvec;
__m256i f, g, h;

cvec = _mm256_setzero_si256();
for (pos = 0; pos + 32 <= len; pos += 32) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
avec = _mm256_xor_si256(avec, bvec);
cvec = _mm256_or_si256(cvec, avec);
h = _mm256_setzero_si256();
for (i = 0; i < len / 32; i++) {
f = _mm256_loadu_si256((__m256i *)&a[32 * i]);
g = _mm256_loadu_si256((__m256i *)&b[32 * i]);
f = _mm256_xor_si256(f, g);
h = _mm256_or_si256(h, f);
}
r = 1 - _mm256_testz_si256(cvec, cvec);
r = 1 - _mm256_testz_si256(h, h);

if (pos < len) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
cvec = _mm256_cmpeq_epi8(avec, bvec);
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len));
a += 32 * i;
b += 32 * i;
len -= 32 * i;
for (i = 0; i < len; i++) {
r |= a[i] ^ b[i];
}

r = (-r) >> 63;
@@ -47,29 +47,27 @@ int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t
* assumes two's complement representation of negative integers.
* Runs in constant time.
*
* Arguments: unsigned char *r: pointer to output byte array
* Arguments: unsigned char *r: pointer to output byte array
* const unsigned char *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* unsigned char b: Condition bit; has to be in {0,1}
* size_t len: Amount of bytes to be copied
* unsigned char b: Condition bit; has to be in {0,1}
**************************************************/
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) {
size_t pos;
void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) {
size_t i;
__m256i xvec, rvec, bvec;

b = -b;
bvec = _mm256_set1_epi8(b);

for (pos = 0; pos + 32 <= len; pos += 32) {
rvec = _mm256_loadu_si256((__m256i *)&r[pos]);
xvec = _mm256_loadu_si256((__m256i *)&x[pos]);
xvec = _mm256_xor_si256(xvec, rvec);
xvec = _mm256_and_si256(xvec, bvec);
rvec = _mm256_xor_si256(rvec, xvec);
_mm256_storeu_si256((__m256i *)&r[pos], rvec);
bvec = _mm256_set1_epi64x(-(uint64_t)b);
for (i = 0; i < len / 32; i++) {
rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]);
xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]);
rvec = _mm256_blendv_epi8(rvec, xvec, bvec);
_mm256_storeu_si256((__m256i *)&r[32 * i], rvec);
}

while (pos < len) {
r[pos] ^= b & (x[pos] ^ r[pos]);
pos += 1;
r += 32 * i;
x += 32 * i;
len -= 32 * i;
for (i = 0; i < len; i++) {
r[i] ^= -b & (x[i] ^ r[i]);
}
}

+ 2
- 2
crypto_kem/kyber1024-90s/clean/Makefile 查看文件

@@ -1,8 +1,8 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libkyber1024-90s_clean.a
HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric-aes.h symmetric.h verify.h
OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o
HEADERS=aes256ctr.h api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h symmetric.h verify.h
OBJECTS=aes256ctr.o cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o symmetric-aes.o verify.o

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake 查看文件

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libkyber1024-90s_clean.lib
OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj
OBJECTS=aes256ctr.obj cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj symmetric-aes.obj verify.obj

# Warning C4146 is raised when a unary minus operator is applied to an
# unsigned type; this has nonetheless been standard and portable for as


+ 564
- 0
crypto_kem/kyber1024-90s/clean/aes256ctr.c 查看文件

@@ -0,0 +1,564 @@
#include "aes256ctr.h"
#include <stdint.h>
#include <string.h>
/*
* Copyright (c) 2016 Thomas Pornin <pornin@bolet.org>
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/


/*
 * Decode four consecutive bytes as one 32-bit word, least significant
 * byte first.
 *
 * src: pointer to at least 4 readable bytes.
 *
 * Returns src[0] | src[1] << 8 | src[2] << 16 | src[3] << 24.
 */
static inline uint32_t br_dec32le(const uint8_t *src) {
    uint32_t word = 0;
    int idx;

    /* Walk from the most significant byte down so each step shifts by 8. */
    for (idx = 3; idx >= 0; idx--) {
        word = (word << 8) | (uint32_t)src[idx];
    }
    return word;
}

/*
 * Decode `num` little-endian 32-bit words from `src` into `v`.
 *
 * v:   output array with room for `num` words.
 * num: number of words to decode (4 * num bytes are read from src).
 * src: input byte array.
 */
static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) {
    size_t idx;

    for (idx = 0; idx < num; idx++) {
        v[idx] = br_dec32le(src + 4 * idx);
    }
}

/*
 * Reverse the byte order of a 32-bit word.
 *
 * First the two bytes inside each 16-bit half are exchanged, then the
 * two halves themselves are exchanged.
 */
static inline uint32_t br_swap32(uint32_t x) {
    const uint32_t low_bytes_up = (x & (uint32_t)0x00FF00FF) << 8;
    const uint32_t high_bytes_down = (x >> 8) & (uint32_t)0x00FF00FF;
    const uint32_t halves = low_bytes_up | high_bytes_down;

    return (halves >> 16) | (halves << 16);
}

/*
 * Store a 32-bit word as four bytes, least significant byte first.
 *
 * dst: pointer to at least 4 writable bytes.
 * x:   value to encode.
 */
static inline void br_enc32le(uint8_t *dst, uint32_t x) {
    unsigned idx;

    for (idx = 0; idx < 4; idx++) {
        dst[idx] = (uint8_t)(x >> (8 * idx));
    }
}

/*
 * Encode `num` 32-bit words from `v` into `dst` in little-endian order.
 *
 * dst: output byte array with room for 4 * num bytes.
 * v:   input words.
 * num: number of words to encode.
 */
static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) {
    size_t idx;

    for (idx = 0; idx < num; idx++) {
        br_enc32le(dst + 4 * idx, v[idx]);
    }
}

/*
 * Apply the S-box circuit in place to eight 64-bit words.
 *
 * q: array of 8 words, read at the start and overwritten at the end.
 *    q[0] is used as the circuit's low input bit (x7) and receives the
 *    low output bit (s7); q[7] is the high input bit (x0) and receives
 *    the high output bit (s0).
 *
 * The body consists solely of XOR, AND and NOT on 64-bit words: there
 * are no table lookups and no branches.
 */
static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
/*
* This S-box implementation is a straightforward translation of
* the circuit described by Boyar and Peralta in "A new
* combinational logic minimization technique with applications
* to cryptology" (https://eprint.iacr.org/2009/191.pdf).
*
* Note that variables x* (input) and s* (output) are numbered
* in "reverse" order (x0 is the high bit, x7 is the low bit).
*/

uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
uint64_t y20, y21;
uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
uint64_t s0, s1, s2, s3, s4, s5, s6, s7;

/* Load the inputs; note the reversed numbering (x0 <- q[7]). */
x0 = q[7];
x1 = q[6];
x2 = q[5];
x3 = q[4];
x4 = q[3];
x5 = q[2];
x6 = q[1];
x7 = q[0];

/*
* Top linear transformation.
* XOR-only: derives y1..y21 from the inputs x0..x7.
*/
y14 = x3 ^ x5;
y13 = x0 ^ x6;
y9 = x0 ^ x3;
y8 = x0 ^ x5;
t0 = x1 ^ x2;
y1 = t0 ^ x7;
y4 = y1 ^ x3;
y12 = y13 ^ y14;
y2 = y1 ^ x0;
y5 = y1 ^ x6;
y3 = y5 ^ y8;
t1 = x4 ^ y12;
y15 = t1 ^ x5;
y20 = t1 ^ x1;
y6 = y15 ^ x7;
y10 = y15 ^ t0;
y11 = y20 ^ y9;
y7 = x7 ^ y11;
y17 = y10 ^ y11;
y19 = y10 ^ y8;
y16 = t0 ^ y11;
y21 = y13 ^ y16;
y18 = x0 ^ y16;

/*
* Non-linear section.
* This is the only part of the circuit that uses AND gates.
*/
t2 = y12 & y15;
t3 = y3 & y6;
t4 = t3 ^ t2;
t5 = y4 & x7;
t6 = t5 ^ t2;
t7 = y13 & y16;
t8 = y5 & y1;
t9 = t8 ^ t7;
t10 = y2 & y7;
t11 = t10 ^ t7;
t12 = y9 & y11;
t13 = y14 & y17;
t14 = t13 ^ t12;
t15 = y8 & y10;
t16 = t15 ^ t12;
t17 = t4 ^ t14;
t18 = t6 ^ t16;
t19 = t9 ^ t14;
t20 = t11 ^ t16;
t21 = t17 ^ y20;
t22 = t18 ^ y19;
t23 = t19 ^ y21;
t24 = t20 ^ y18;

t25 = t21 ^ t22;
t26 = t21 & t23;
t27 = t24 ^ t26;
t28 = t25 & t27;
t29 = t28 ^ t22;
t30 = t23 ^ t24;
t31 = t22 ^ t26;
t32 = t31 & t30;
t33 = t32 ^ t24;
t34 = t23 ^ t33;
t35 = t27 ^ t33;
t36 = t24 & t35;
t37 = t36 ^ t34;
t38 = t27 ^ t36;
t39 = t29 & t38;
t40 = t25 ^ t39;

/* Combine the intermediate t* values with the y* values into z0..z17. */
t41 = t40 ^ t37;
t42 = t29 ^ t33;
t43 = t29 ^ t40;
t44 = t33 ^ t37;
t45 = t42 ^ t41;
z0 = t44 & y15;
z1 = t37 & y6;
z2 = t33 & x7;
z3 = t43 & y16;
z4 = t40 & y1;
z5 = t29 & y7;
z6 = t42 & y11;
z7 = t45 & y17;
z8 = t41 & y10;
z9 = t44 & y12;
z10 = t37 & y3;
z11 = t33 & y4;
z12 = t43 & y13;
z13 = t40 & y5;
z14 = t29 & y2;
z15 = t42 & y9;
z16 = t45 & y14;
z17 = t41 & y8;

/*
* Bottom linear transformation.
* XOR-only, except that s1, s2, s6 and s7 are formed with a
* bitwise complement (~) of one operand.
*/
t46 = z15 ^ z16;
t47 = z10 ^ z11;
t48 = z5 ^ z13;
t49 = z9 ^ z10;
t50 = z2 ^ z12;
t51 = z2 ^ z5;
t52 = z7 ^ z8;
t53 = z0 ^ z3;
t54 = z6 ^ z7;
t55 = z16 ^ z17;
t56 = z12 ^ t48;
t57 = t50 ^ t53;
t58 = z4 ^ t46;
t59 = z3 ^ t54;
t60 = t46 ^ t57;
t61 = z14 ^ t57;
t62 = t52 ^ t58;
t63 = t49 ^ t58;
t64 = z4 ^ t59;
t65 = t61 ^ t62;
t66 = z1 ^ t63;
s0 = t59 ^ t63;
s6 = t56 ^ ~t62;
s7 = t48 ^ ~t60;
t67 = t64 ^ t65;
s3 = t53 ^ t66;
s4 = t51 ^ t66;
s5 = t47 ^ t65;
s1 = t64 ^ ~s3;
s2 = t55 ^ ~t67;

/* Store the outputs with the same reversed numbering (s0 -> q[7]). */
q[7] = s0;
q[6] = s1;
q[5] = s2;
q[4] = s3;
q[3] = s4;
q[2] = s5;
q[1] = s6;
q[0] = s7;
}

/*
 * Exchange bit groups between two words.
 *
 * The bits of *x selected by `ch` move down by `s` into *y, and the bits
 * of *y selected by `cl` move up by `s` into *x. The bits of *x under
 * `cl` and of *y under `ch` stay where they are.
 */
static inline void br_aes_ct64_swap_bits(uint64_t *x, uint64_t *y,
        uint64_t cl, uint64_t ch, unsigned s) {
    const uint64_t a = *x;
    const uint64_t b = *y;

    *x = (a & cl) | ((b & cl) << s);
    *y = ((a & ch) >> s) | (b & ch);
}

/*
 * Rearrange the bits of the eight words q[0..7] in place, in three
 * passes: adjacent words exchange alternating single bits, words two
 * apart exchange alternating bit pairs, and words four apart exchange
 * alternating nibbles.
 *
 * q: array of 8 words, modified in place.
 */
static void br_aes_ct64_ortho(uint64_t *q) {
    unsigned i;

    /* Pass 1: pairs (0,1), (2,3), (4,5), (6,7), shift 1. */
    for (i = 0; i < 8; i += 2) {
        br_aes_ct64_swap_bits(&q[i], &q[i + 1],
                              (uint64_t)0x5555555555555555,
                              (uint64_t)0xAAAAAAAAAAAAAAAA, 1);
    }

    /* Pass 2: pairs (0,2), (1,3), (4,6), (5,7), shift 2. */
    for (i = 0; i < 8; i++) {
        if ((i & 2) != 0) {
            continue;
        }
        br_aes_ct64_swap_bits(&q[i], &q[i + 2],
                              (uint64_t)0x3333333333333333,
                              (uint64_t)0xCCCCCCCCCCCCCCCC, 2);
    }

    /* Pass 3: pairs (0,4), (1,5), (2,6), (3,7), shift 4. */
    for (i = 0; i < 4; i++) {
        br_aes_ct64_swap_bits(&q[i], &q[i + 4],
                              (uint64_t)0x0F0F0F0F0F0F0F0F,
                              (uint64_t)0xF0F0F0F0F0F0F0F0, 4);
    }
}

/*
 * Spread the four bytes of a 32-bit word over the even-numbered bytes
 * (0, 2, 4, 6) of a 64-bit word; the odd-numbered bytes are zero.
 */
static inline uint64_t br_aes_ct64_spread_bytes(uint32_t word) {
    uint64_t x = word;

    x = (x | (x << 16)) & (uint64_t)0x0000FFFF0000FFFF;
    x = (x | (x << 8)) & (uint64_t)0x00FF00FF00FF00FF;
    return x;
}

/*
 * Pack four 32-bit words into two 64-bit words by byte interleaving.
 *
 * q0: receives the bytes of w[0] in its even byte positions and the
 *     bytes of w[2] in its odd byte positions.
 * q1: same, built from w[1] (even) and w[3] (odd).
 * w:  four input words.
 */
static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
    /* Read every input before storing any output. */
    const uint64_t even0 = br_aes_ct64_spread_bytes(w[0]);
    const uint64_t even1 = br_aes_ct64_spread_bytes(w[1]);
    const uint64_t odd0 = br_aes_ct64_spread_bytes(w[2]);
    const uint64_t odd1 = br_aes_ct64_spread_bytes(w[3]);

    *q0 = even0 | (odd0 << 8);
    *q1 = even1 | (odd1 << 8);
}

/*
 * Collect the even-numbered bytes (0, 2, 4, 6) of a 64-bit word into a
 * 32-bit word; the odd-numbered bytes of the input are ignored.
 */
static inline uint32_t br_aes_ct64_gather_bytes(uint64_t x) {
    x &= (uint64_t)0x00FF00FF00FF00FF;
    x = (x | (x >> 8)) & (uint64_t)0x0000FFFF0000FFFF;
    return (uint32_t)x | (uint32_t)(x >> 16);
}

/*
 * Unpack two byte-interleaved 64-bit words into four 32-bit words.
 *
 * w:  output, four words. w[0] and w[1] come from the even bytes of q0
 *     and q1; w[2] and w[3] come from their odd bytes.
 * q0: first packed word.
 * q1: second packed word.
 */
static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    w[0] = br_aes_ct64_gather_bytes(q0);
    w[1] = br_aes_ct64_gather_bytes(q1);
    /* Shifting by 8 moves the odd bytes into the even positions. */
    w[2] = br_aes_ct64_gather_bytes(q0 >> 8);
    w[3] = br_aes_ct64_gather_bytes(q1 >> 8);
}

/*
 * Round constants for the key schedule. br_aes_ct64_keysched XORs
 * Rcon[k] into the substituted word at the start of each group of
 * key words.
 */
static const uint8_t Rcon[] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
};

/*
 * Run a single 32-bit word through the bitsliced S-box.
 *
 * The word is placed in the first of eight otherwise zero 64-bit words,
 * rearranged with br_aes_ct64_ortho, substituted, and rearranged back.
 *
 * Returns the low 32 bits of the first word after the round trip.
 */
static uint32_t sub_word(uint32_t x) {
    uint64_t planes[8] = { 0 };

    planes[0] = x;
    br_aes_ct64_ortho(planes);
    br_aes_ct64_bitslice_Sbox(planes);
    br_aes_ct64_ortho(planes);
    return (uint32_t)planes[0];
}

/*
 * Expand a 32-byte key into the compact bitsliced key schedule.
 *
 * comp_skey: output, 30 words (two per round key, 15 round keys).
 * key:       input key, 32 bytes.
 *
 * Step 1 derives 60 32-bit key words from the 8 decoded key words.
 * Step 2 packs each group of four key words into two 64-bit words.
 */
static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) {
    const int key_words = 32 >> 2;          /* 8 words in a 32-byte key */
    const int total_words = (14 + 1) << 2;  /* 15 round keys of 4 words */
    uint32_t words[60];
    uint32_t last;
    int w, out;

    br_range_dec32le(words, (size_t)key_words, key);
    last = words[key_words - 1];

    for (w = key_words; w < total_words; w++) {
        const int phase = w % key_words;

        if (phase == 0) {
            /* Start of a group: rotate right by 8, substitute, add Rcon. */
            last = (last << 24) | (last >> 8);
            last = sub_word(last) ^ Rcon[w / key_words - 1];
        } else if (key_words > 6 && phase == 4) {
            /* Extra substitution in the middle of each 8-word group. */
            last = sub_word(last);
        }
        last ^= words[w - key_words];
        words[w] = last;
    }

    for (w = 0, out = 0; w < total_words; w += 4, out += 2) {
        uint64_t q[8];
        int lane;

        br_aes_ct64_interleave_in(&q[0], &q[4], words + w);
        /* Replicate the round key into all four lanes of each half. */
        for (lane = 1; lane < 4; lane++) {
            q[lane] = q[0];
            q[4 + lane] = q[4];
        }
        br_aes_ct64_ortho(q);

        /* Keep one bit out of every four from each lane. */
        comp_skey[out + 0] =
            (q[0] & (uint64_t)0x1111111111111111)
            | (q[1] & (uint64_t)0x2222222222222222)
            | (q[2] & (uint64_t)0x4444444444444444)
            | (q[3] & (uint64_t)0x8888888888888888);
        comp_skey[out + 1] =
            (q[4] & (uint64_t)0x1111111111111111)
            | (q[5] & (uint64_t)0x2222222222222222)
            | (q[6] & (uint64_t)0x4444444444444444)
            | (q[7] & (uint64_t)0x8888888888888888);
    }
}

/*
 * Expand the compact key schedule into the form consumed by
 * add_round_key.
 *
 * skey:      output, 120 words (four per compact word).
 * comp_skey: input, 30 compact words.
 *
 * For each compact word, output word `bit` (0..3) takes every fourth
 * bit starting at position `bit` and widens each such bit to fill its
 * whole nibble: (x << 4) - x multiplies by 15 modulo 2^64.
 */
static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) {
    const unsigned compact_words = (14 + 1) << 1;
    unsigned u, bit;

    for (u = 0; u < compact_words; u++) {
        /* Read the input once before writing any output for it. */
        const uint64_t packed = comp_skey[u];

        for (bit = 0; bit < 4; bit++) {
            const uint64_t x = (packed >> bit) & (uint64_t)0x1111111111111111;

            skey[4 * u + bit] = (x << 4) - x;
        }
    }
}

/*
 * XOR one round key into the state.
 *
 * q:  state, 8 words, modified in place.
 * sk: round key, 8 words.
 */
static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    int word;

    for (word = 0; word < 8; word++) {
        q[word] ^= sk[word];
    }
}

/*
 * Apply the row-shift permutation to each of the eight state words.
 *
 * q: state, 8 words, modified in place.
 *
 * Each word is treated as four 16-bit groups: the lowest group is kept,
 * and the next three are rotated right within themselves by 4, 8 and 12
 * bits respectively.
 */
static inline void shift_rows(uint64_t *q) {
    uint64_t *word;

    for (word = q; word != q + 8; word++) {
        const uint64_t x = *word;
        uint64_t shifted;

        /* Bits 0..15: unchanged. */
        shifted = x & (uint64_t)0x000000000000FFFF;
        /* Bits 16..31: rotate right by 4. */
        shifted |= (x & (uint64_t)0x00000000FFF00000) >> 4;
        shifted |= (x & (uint64_t)0x00000000000F0000) << 12;
        /* Bits 32..47: rotate right by 8. */
        shifted |= (x & (uint64_t)0x0000FF0000000000) >> 8;
        shifted |= (x & (uint64_t)0x000000FF00000000) << 8;
        /* Bits 48..63: rotate right by 12. */
        shifted |= (x & (uint64_t)0xF000000000000000) >> 12;
        shifted |= (x & (uint64_t)0x0FFF000000000000) << 4;

        *word = shifted;
    }
}

/*
 * Rotate a 64-bit word by 32 bits, i.e. exchange its two 32-bit halves.
 */
static inline uint64_t rotr32(uint64_t x) {
    const uint64_t upper_to_lower = x >> 32;
    const uint64_t lower_to_upper = x << 32;

    return lower_to_upper | upper_to_lower;
}

/*
 * Apply the column-mixing step to the eight state words.
 *
 * q: state, 8 words, modified in place.
 *
 * With r[i] = q[i] rotated right by 16 and s[i] = q[i] ^ r[i], every
 * output word is s[i-1] ^ r[i] ^ rotr32(s[i]) (index i-1 taken modulo
 * 8), and words 1, 3 and 4 additionally absorb s[7].
 */
static inline void mix_columns(uint64_t *q) {
    uint64_t rot16[8];
    uint64_t sum[8];
    int i;

    /* Capture everything derived from the old state before overwriting. */
    for (i = 0; i < 8; i++) {
        rot16[i] = (q[i] >> 16) | (q[i] << 48);
        sum[i] = q[i] ^ rot16[i];
    }

    for (i = 0; i < 8; i++) {
        uint64_t mixed = sum[(i + 7) & 7] ^ rot16[i] ^ rotr32(sum[i]);

        if (i == 1 || i == 3 || i == 4) {
            mixed ^= sum[7];
        }
        q[i] = mixed;
    }
}

/*
 * Add 4 to a counter word that is stored byte-swapped.
 *
 * x: pointer to the counter word, updated in place. The word is
 *    byte-swapped, incremented by 4 (wrapping modulo 2^32), and
 *    byte-swapped back.
 */
static void inc4_be(uint32_t *x) {
    uint32_t counter = br_swap32(*x);

    counter += 4;
    *x = br_swap32(counter);
}

/*
 * Encrypt the four counter blocks in `ivw` and advance each counter.
 *
 * out:    receives 64 bytes of output (four 16-byte blocks).
 * ivw:    four counter blocks of four 32-bit words each. On return the
 *         last word of every block has been advanced by 4 (see inc4_be),
 *         ready for the next call.
 * sk_exp: expanded key schedule, 15 round keys of 8 words each. The
 *         function reads indices 0..119 and never writes to it, so the
 *         parameter is declared with the full length of 120 words and
 *         const-qualified.
 */
static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], const uint64_t sk_exp[120]) {
    uint32_t w[16];
    uint64_t q[8];
    int i;

    /* Work on a copy so ivw only changes through the counter update. */
    memcpy(w, ivw, sizeof(w));
    for (i = 0; i < 4; i++) {
        br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2));
    }
    br_aes_ct64_ortho(q);

    /* Initial key addition, 13 full rounds, then a final round
     * without mix_columns. */
    add_round_key(q, sk_exp);
    for (i = 1; i < 14; i++) {
        br_aes_ct64_bitslice_Sbox(q);
        shift_rows(q);
        mix_columns(q);
        add_round_key(q, sk_exp + (i << 3));
    }
    br_aes_ct64_bitslice_Sbox(q);
    shift_rows(q);
    add_round_key(q, sk_exp + 112);

    /* Undo the bitslicing and serialize the four blocks. */
    br_aes_ct64_ortho(q);
    for (i = 0; i < 4; i ++) {
        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
    }
    br_range_enc32le(out, w, 16);

    /* Increase counter for next 4 blocks */
    inc4_be(ivw + 3);
    inc4_be(ivw + 7);
    inc4_be(ivw + 11);
    inc4_be(ivw + 15);
}

/*
 * Build the expanded key schedule for a 32-byte key.
 *
 * sk_exp: output, 120 words.
 * key:    input key, 32 bytes.
 *
 * The key is first turned into the 30-word compact schedule, which is
 * then expanded into sk_exp.
 */
static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) {
    uint64_t compact[30];

    br_aes_ct64_keysched(compact, key);
    br_aes_ct64_skey_expand(sk_exp, compact);
}

/*
 * Write `len` bytes of counter-mode keystream to `data`.
 *
 * sk_exp: expanded key schedule (120 words).
 * iv:     12-byte nonce, decoded as three little-endian words.
 * cc:     initial counter value; the four parallel blocks start at
 *         cc, cc + 1, cc + 2 and cc + 3.
 * data:   output buffer.
 * len:    number of bytes to produce.
 */
static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) {
    uint32_t ivw[16];
    uint32_t lane;

    /* Each of the four blocks is nonce (3 words) || byte-swapped counter. */
    for (lane = 0; lane < 4; lane++) {
        br_range_dec32le(ivw + 4 * lane, 3, iv);
        ivw[4 * lane + 3] = br_swap32(cc + lane);
    }

    /* Full 64-byte chunks go straight into the output buffer. */
    for (; len > 64; len -= 64, data += 64) {
        aes_ctr4x(data, ivw, sk_exp);
    }

    /* The last 1..64 bytes pass through a scratch block. */
    if (len > 0) {
        uint8_t tmp[64];

        aes_ctr4x(tmp, ivw, sk_exp);
        memcpy(data, tmp, len);
    }
}

/*************************************************
* Name:        PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf
*
* Description: One-shot keystream generation: expands the key, then
*              produces outlen bytes of counter-mode output starting
*              at counter value 0.
*
* Arguments:   - uint8_t *out:         pointer to output buffer
*              - size_t outlen:        number of bytes to write
*              - const uint8_t *key:   pointer to 32-byte key
*              - const uint8_t *nonce: pointer to 12-byte nonce
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) {
    uint64_t schedule[120];

    br_aes_ct64_ctr_init(schedule, key);
    br_aes_ct64_ctr_run(schedule, nonce, 0, out, outlen);
}

/*************************************************
* Name:        PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init
*
* Description: Prepare a streaming context: expands the key into
*              s->sk_exp and fills s->ivw with four counter blocks,
*              each holding the three nonce words followed by the
*              byte-swapped counter values 0, 1, 2 and 3.
*
* Arguments:   - aes256ctr_ctx *s:     pointer to context to initialize
*              - const uint8_t *key:   pointer to 32-byte key
*              - const uint8_t *nonce: pointer to 12-byte nonce
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) {
    uint32_t lane;

    br_aes_ct64_ctr_init(s->sk_exp, key);

    for (lane = 0; lane < 4; lane++) {
        br_range_dec32le(s->ivw + 4 * lane, 3, nonce);
        s->ivw[4 * lane + 3] = br_swap32(lane);
    }
}

/*************************************************
* Name:        PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks
*
* Description: Produce nblocks blocks of 64 bytes of keystream from an
*              initialized context. The counters stored in the context
*              advance with every block, so successive calls continue
*              the stream.
*
* Arguments:   - uint8_t *out:     pointer to output buffer
*                                  (64 * nblocks bytes are written)
*              - size_t nblocks:   number of 64-byte blocks to write
*              - aes256ctr_ctx *s: pointer to context, updated in place
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) {
    size_t remaining;

    for (remaining = nblocks; remaining > 0; remaining--) {
        aes_ctr4x(out, s->ivw, s->sk_exp);
        out += 64;
    }
}

+ 28
- 0
crypto_kem/kyber1024-90s/clean/aes256ctr.h 查看文件

@@ -0,0 +1,28 @@
#ifndef PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H
#define PQCLEAN_KYBER102490S_CLEAN_AES256CTR_H

#include <stddef.h>
#include <stdint.h>

/*
 * Size in bytes of one output block. The squeezeblocks implementation in
 * aes256ctr.c advances its output pointer by 64 bytes per block.
 */
#define AES256CTR_BLOCKBYTES 64


/*
 * Streaming AES-256-CTR context.
 *
 * sk_exp: expanded key schedule, written by ..._aes256ctr_init.
 * ivw:    four counter blocks of four 32-bit words each (three nonce
 *         words followed by a counter word); updated by every call to
 *         ..._aes256ctr_squeezeblocks.
 */
typedef struct {
uint64_t sk_exp[120];
uint32_t ivw[16];
} aes256ctr_ctx;

/*
 * One-shot interface: write outlen bytes of keystream for the given
 * 32-byte key and 12-byte nonce to out.
 */
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(uint8_t *out,
size_t outlen,
const uint8_t key[32],
const uint8_t nonce[12]);

/*
 * Initialize a streaming context from a 32-byte key and 12-byte nonce.
 */
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(aes256ctr_ctx *state,
const uint8_t key[32],
const uint8_t nonce[12]);

/*
 * Write nblocks blocks of keystream to out and advance the context.
 */
void PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(uint8_t *out,
size_t nblocks,
aes256ctr_ctx *state);

#endif

+ 38
- 5
crypto_kem/kyber1024-90s/clean/cbd.c 查看文件

@@ -5,7 +5,7 @@
/*************************************************
* Name: load32_littleendian
*
* Description: load bytes into a 32-bit integer
* Description: load 4 bytes into a 32-bit integer
* in little-endian order
*
* Arguments: - const uint8_t *x: pointer to input byte array
@@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) {
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_cbd
* Name: load24_littleendian
*
* Description: load 3 bytes into a 32-bit integer
* in little-endian order.
* This function is only needed for Kyber-512
*
* Arguments: - const uint8_t *x: pointer to input byte array
*
* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
**************************************************/


/*************************************************
* Name: cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* a centered binomial distribution with parameter eta=2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) {
static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) {
unsigned int i, j;
uint32_t t, d;
int16_t a, b;
@@ -48,3 +61,23 @@ void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER
}
}
}

/*************************************************
* Name: cbd3
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter eta=3.
* This function is only needed for Kyber-512
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/

void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) {
cbd2(r, buf);
}

void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) {
cbd2(r, buf);
}

+ 3
- 1
crypto_kem/kyber1024-90s/clean/cbd.h 查看文件

@@ -4,6 +4,8 @@
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);
void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]);

void PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]);

#endif

+ 72
- 77
crypto_kem/kyber1024-90s/clean/indcpa.c 查看文件

@@ -15,8 +15,8 @@
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
@@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key
* polynomial vector
* - uint8_t *seed: pointer to output seed to generate
* matrix A
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector
* - uint8_t *seed: pointer to output seed to generate matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk,
@@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk,
*
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
@@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
/*************************************************
* Name: unpack_sk
*
* Description: De-serialize the secret key;
* inverse of pack_sk
* Description: De-serialize the secret key; inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of
* polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk);
}

@@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk,
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b);
PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c);
PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b,
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
@@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < len && pos + 3 <= buflen) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
pos += 3;

if (val < 19 * KYBER_Q) {
val -= (val >> 12) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (ctr < len && val1 < KYBER_Q) {
r[ctr++] = val1;
}
}

@@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r,
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T
* is generated
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr, i, j;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES];
unsigned int ctr, i, j, k;
unsigned int buflen, off;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
xof_state state;

for (i = 0; i < KYBER_K; i++) {
@@ -182,12 +173,17 @@ void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_
}

xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf));
buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);

while (ctr < KYBER_N) {
xof_squeezeblocks(buf, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf,
XOF_BLOCKBYTES);
off = buflen % 3;
for (k = 0; k < off; k++) {
buf[k] = buf[buflen - off + k];
}
xof_squeezeblocks(buf + off, 1, &state);
buflen = off + XOF_BLOCKBYTES;
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
}
xof_ctx_release(&state);
}
@@ -220,10 +216,10 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY
gen_a(a, publicseed);

for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++);
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++);
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
}

PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv);
@@ -231,7 +227,7 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]);
}

@@ -248,16 +244,15 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEY
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins used as seed
* (of length KYBER_SYMBYTES) to deterministically
* generate all randomness
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
@@ -266,7 +261,7 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
unsigned int i;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
polyvec sp, pkpv, ep, at[KYBER_K], b;
poly v, k, epp;

unpack_pk(&pkpv, seed, pk);
@@ -274,32 +269,32 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
gen_at(at, seed);

for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++);
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++);
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++);
}
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++);
PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++);

PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&b);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v);

PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&b, &b, &ep);
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp);
PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &k);
PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&bp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(&b);
PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&v);

pack_ciphertext(c, &bp, &v);
pack_ciphertext(c, &b, &v);
}

/*************************************************
@@ -308,24 +303,24 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
polyvec b, skpv;
poly v, mp;

unpack_ciphertext(&bp, &v, c);
unpack_ciphertext(&b, &v, c);
unpack_sk(&skpv, sk);

PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&b);
PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp);

PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp);


+ 16
- 15
crypto_kem/kyber1024-90s/clean/kem.c 查看文件

@@ -14,13 +14,14 @@
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES],
unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -39,17 +40,17 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned ch
* secret for given public key
*
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES],
unsigned char ss[KYBER_SSBYTES],
const unsigned char pk[KYBER_PUBLICKEYBYTES]) {
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
uint8_t kr[2 * KYBER_SYMBYTES];
@@ -79,19 +80,19 @@ int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct,
* cipher text and private key
*
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES],
const unsigned char ct[KYBER_CIPHERTEXTBYTES],
const unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
int fail;
uint8_t buf[2 * KYBER_SYMBYTES];


+ 42
- 57
crypto_kem/kyber1024-90s/clean/ntt.c 查看文件

@@ -3,11 +3,11 @@
#include "reduce.h"
#include <stdint.h>

/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform:
/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and zetas_inv used in the number-theoretic transform:

#define KYBER_ROOT_OF_UNITY 17

static const uint16_t tree[128] = {
static const uint8_t tree[128] = {
0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
@@ -19,51 +19,41 @@ static const uint16_t tree[128] = {
};

void init_ntt() {
unsigned int i, j, k;
unsigned int i;
int16_t tmp[128];

tmp[0] = MONT;
for(i = 1; i < 128; ++i)
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q);
for(i=1;i<128;i++)
tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);

for(i = 0; i < 128; ++i)
for(i=0;i<128;i++) {
PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]];

k = 0;
for(i = 64; i >= 1; i >>= 1)
for(j = i; j < 2*i; ++j)
PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]];

PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] > KYBER_Q/2)
PQCLEAN_KYBER102490S_CLEAN_zetas[i] -= KYBER_Q;
if(PQCLEAN_KYBER102490S_CLEAN_zetas[i] < -KYBER_Q/2)
PQCLEAN_KYBER102490S_CLEAN_zetas[i] += KYBER_Q;
}
}

*/

const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = {
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962,
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017,
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047,
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226,
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574,
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349,
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193,
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459,
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628
};

const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = {
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535,
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465,
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685,
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235,
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652,
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552,
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871,
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171,
3127, 3042, 1907, 1836, 1517, 359, 758, 1441
};
-1044, -758, -359, -1517, 1493, 1422, 287, 202,
-171, 622, 1577, 182, 962, -1202, -1474, 1468,
573, -1325, 264, 383, -829, 1458, -1602, -130,
-681, 1017, 732, 608, -1542, 411, -205, -1571,
1223, 652, -552, 1015, -1293, 1491, -282, -1544,
516, -8, -320, -666, -1618, -1162, 126, 1469,
-853, -90, -271, 830, 107, -1421, -247, -951,
-398, 961, -1508, -725, 448, -1065, 677, -1275,
-1103, 430, 555, 843, -1251, 871, 1550, 105,
422, 587, 177, -235, -291, -460, 1574, 1653,
-246, 778, 1159, -147, -777, 1483, -602, 1119,
-1590, 644, -872, 349, 418, 329, -156, -75,
817, 1097, 603, 610, 1322, -1285, -1465, 384,
-1215, -136, 1218, -1335, -874, 220, -1187, -1659,
-1185, -1530, -1278, 794, -1510, -854, -870, 478,
-108, -308, 996, 991, 958, -1460, 1522, 1628
};

/*************************************************
* Name: fqmul
@@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) {
/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_ntt
*
* Description: Inplace number-theoretic transform (NTT) in Rq
* Description: Inplace number-theoretic transform (NTT) in Rq.
* input is in standard order, output is in bitreversed order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) {
unsigned int len, start, j, k;
@@ -96,7 +85,7 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) {
for (len = 128; len >= 2; len >>= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++];
for (j = start; j < start + len; ++j) {
for (j = start; j < start + len; j++) {
t = fqmul(zeta, r[j + len]);
r[j + len] = r[j] - t;
r[j] = r[j] + t;
@@ -112,28 +101,28 @@ void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) {
* multiplication by Montgomery factor 2^16.
* Input is in bitreversed order, output is in standard order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) {
unsigned int start, len, j, k;
int16_t t, zeta;
const int16_t f = 1441; // mont^2/128

k = 0;
k = 127;
for (len = 2; len <= 128; len <<= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++];
for (j = start; j < start + len; ++j) {
zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k--];
for (j = start; j < start + len; j++) {
t = r[j];
r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]);
r[j + len] = t - r[j + len];
r[j + len] = r[j + len] - t;
r[j + len] = fqmul(zeta, r[j + len]);
}
}
}

for (j = 0; j < 256; ++j) {
r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]);
for (j = 0; j < 256; j++) {
r[j] = fqmul(r[j], f);
}
}

@@ -143,19 +132,15 @@ void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) {
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
* used for multiplication of elements in Rq in NTT domain
*
* Arguments: - int16_t r[2]: pointer to the output polynomial
* Arguments: - int16_t r[2]: pointer to the output polynomial
* - const int16_t a[2]: pointer to the first factor
* - const int16_t b[2]: pointer to the second factor
* - int16_t zeta: integer defining the reduction polynomial
* - int16_t zeta: integer defining the reduction polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2],
const int16_t a[2],
const int16_t b[2],
int16_t zeta) {
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
r[0] = fqmul(a[1], b[1]);
r[0] = fqmul(r[0], zeta);
r[0] += fqmul(a[0], b[0]);

r[1] = fqmul(a[0], b[1]);
r[1] += fqmul(a[1], b[0]);
}

+ 1
- 6
crypto_kem/kyber1024-90s/clean/ntt.h 查看文件

@@ -5,15 +5,10 @@

extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128];

extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128];

void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]);

void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]);

void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2],
const int16_t a[2],
const int16_t b[2],
int16_t zeta);
void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);

#endif

+ 7
- 9
crypto_kem/kyber1024-90s/clean/params.h 查看文件

@@ -7,8 +7,6 @@
#define KYBER_N 256
#define KYBER_Q 3329

#define KYBER_ETA 2

#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

@@ -16,20 +14,20 @@
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_ETA1 2
#define KYBER_POLYCOMPRESSEDBYTES 160
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_ETA2 2

#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES)
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \
+ KYBER_POLYCOMPRESSEDBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)

#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
/* 32 bytes of additional space to save H(pk) */
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \
+ KYBER_INDCPA_PUBLICKEYBYTES \
+ 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)

#endif

+ 55
- 50
crypto_kem/kyber1024-90s/clean/poly.c 查看文件

@@ -13,17 +13,19 @@
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) {
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) {
size_t i, j;
int16_t u;
uint8_t t[8];

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
// map to positive standard representatives
u = a->coeffs[8 * i + j];
u += (u >> 15) & KYBER_Q;
t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}

r[0] = (t[0] >> 0) | (t[1] << 5);
@@ -41,7 +43,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTE
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
@@ -74,20 +76,21 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_P
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) {
size_t i;
uint16_t t0, t1;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (i = 0; i < KYBER_N / 2; i++) {
t0 = a->coeffs[2 * i];
// map to positive standard representatives
t0 = a->coeffs[2 * i];
t0 += ((int16_t)t0 >> 15) & KYBER_Q;
t1 = a->coeffs[2 * i + 1];
r[3 * i + 0] = (uint8_t)(t0 >> 0);
r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4));
r[3 * i + 2] = (uint8_t)(t1 >> 4);
t1 += ((int16_t)t1 >> 15) & KYBER_Q;
r[3 * i + 0] = (t0 >> 0);
r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
r[3 * i + 2] = (t1 >> 4);
}
}

@@ -97,7 +100,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a
* Description: De-serialization of a polynomial;
* inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
@@ -114,7 +117,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_PO
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
@@ -135,41 +138,60 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_IN
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) {
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) {
size_t i, j;
uint16_t t;

PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
msg[i] = 0;
for (j = 0; j < 8; j++) {
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
t = a->coeffs[8 * i + j];
t += ((int16_t)t >> 15) & KYBER_Q;
t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
msg[i] |= t << j;
}
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA1
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA1 * KYBER_N / 4];
prf(buf, sizeof(buf), seed, nonce);
PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta1(r, buf);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
* with parameter KYBER_ETA2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA * KYBER_N / 4];
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA2 * KYBER_N / 4];
prf(buf, sizeof(buf), seed, nonce);
PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf);
PQCLEAN_KYBER102490S_CLEAN_poly_cbd_eta2(r, buf);
}


/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt
*
@@ -202,7 +224,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) {
*
* Description: Multiplication of two polynomials in NTT domain
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -210,8 +232,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a,
size_t i;
for (i = 0; i < KYBER_N / 4; i++) {
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2],
-PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]);
}
}

@@ -246,28 +267,12 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) {
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) {
size_t i;
for (i = 0; i < KYBER_N; i++) {
r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_add
*
* Description: Add two polynomials
* Description: Add two polynomials; no modular reduction is performed
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -281,7 +286,7 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b)
/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub
*
* Description: Subtract two polynomials
* Description: Subtract two polynomials; no modular reduction is performed
*
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial


+ 6
- 5
crypto_kem/kyber1024-90s/clean/poly.h 查看文件

@@ -11,16 +11,18 @@ typedef struct {
int16_t coeffs[KYBER_N];
} poly;

void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);
void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a);

void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r);
void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r);
@@ -28,7 +30,6 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a,
void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r);

void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r);
void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r);

void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b);


+ 15
- 36
crypto_kem/kyber1024-90s/clean/polyvec.c 查看文件

@@ -10,19 +10,18 @@
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
* - const polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) {
unsigned int i, j, k;

PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a);

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
t[k] = a->vec[i].coeffs[8 * j + k];
t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff;
}

r[ 0] = (uint8_t)(t[0] >> 0);
@@ -51,8 +50,7 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESS
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
unsigned int i, j, k;

uint16_t t[8];
@@ -82,9 +80,9 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r,
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECBYTES)
* - polyvec *a: pointer to input vector of polynomials
* - const polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) {
unsigned int i;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
@@ -138,18 +136,16 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) {
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery
*
* Description: Pointwise multiply elements of a and b, accumulate into r,
* Description: Multiply elements of a and b in NTT domain, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) {
unsigned int i;
poly t;

@@ -166,10 +162,10 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* of each element of a vector of polynomials;
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) {
unsigned int i;
@@ -178,29 +174,12 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) {
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) {
unsigned int i;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add
*
* Description: Add vectors of polynomials
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/


+ 4
- 8
crypto_kem/kyber1024-90s/clean/polyvec.h 查看文件

@@ -8,22 +8,18 @@ typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r);
void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r);

void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);



+ 4
- 20
crypto_kem/kyber1024-90s/clean/reduce.c 查看文件

@@ -6,8 +6,7 @@
* Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce
*
* Description: Montgomery reduction; given a 32-bit integer a, computes
* 16-bit integer congruent to a * R^-1 mod q,
* where R=2^16
* 16-bit integer congruent to a * R^-1 mod q, where R=2^16
*
* Arguments: - int32_t a: input integer to be reduced;
* has to be in {-q2^15,...,q2^15-1}
@@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) {
* Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce
*
* Description: Barrett reduction; given a 16-bit integer a, computes
* 16-bit integer congruent to a mod q in {0,...,q}
* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
*
* Arguments: - int16_t a: input integer to be reduced
*
* Returns: integer in {0,...,q} congruent to a modulo q.
* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
**************************************************/
int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) {
int16_t t;
const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q;

t = (int32_t)v * a >> 26;
t = ((int32_t)v * a + (1 << 25)) >> 26;
t *= KYBER_Q;
return a - t;
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_csubq
*
* Description: Conditionally subtract q
*
* Arguments: - int16_t a: input integer
*
* Returns: a - q if a >= q, else a
**************************************************/
int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a) {
a -= KYBER_Q;
a += (a >> 15) & KYBER_Q;
return a;
}

+ 0
- 2
crypto_kem/kyber1024-90s/clean/reduce.h 查看文件

@@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a);

int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a);

int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a);

#endif

+ 10
- 92
crypto_kem/kyber1024-90s/clean/symmetric-aes.c 查看文件

@@ -1,100 +1,18 @@
#include "aes.h"
#include "aes256ctr.h"
#include "params.h"
#include "symmetric.h"
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static inline void br_enc32be(unsigned char *dst, uint32_t x) {
dst[3] = (unsigned char)x;
dst[2] = (unsigned char)(x >> 8);
dst[1] = (unsigned char)(x >> 16);
dst[0] = (unsigned char)(x >> 24);
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y) {
uint8_t expnonce[12] = {0};
expnonce[0] = x;
expnonce[1] = y;
PQCLEAN_KYBER102490S_CLEAN_aes256ctr_init(state, seed, expnonce);
}

static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) {
uint8_t ivw[16];
uint8_t buf[AES_BLOCKBYTES];
size_t i = 0;

memcpy(ivw, iv, AESCTR_NONCEBYTES);
br_enc32be(ivw + AESCTR_NONCEBYTES, ctr);

while (outlen > AES_BLOCKBYTES) {
aes256_ecb(out, ivw, 1, ctx);
br_enc32be(ivw + AESCTR_NONCEBYTES, ++ctr);
out += AES_BLOCKBYTES;
outlen -= AES_BLOCKBYTES;
}
if (outlen > 0) {
aes256_ecb(buf, ivw, 1, ctx);
for (i = 0; i < outlen; i++) {
out[i] = buf[i];
}
}
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256_prf
*
* Description: Pseudorandom function built on AES256 in CTR mode. The
* one-byte nonce is placed in byte 0 of a 12-byte IV whose
* remaining bytes are zero; the key schedule is expanded,
* the stream is produced by aes256_ctr and the schedule is
* released again before returning. (The block counter is
* handled inside aes256_ctr; it is documented as starting
* at zero.)
*
* Arguments: - uint8_t *output: buffer receiving the stream
* - size_t outlen: number of bytes to produce
* - const uint8_t *key: 32-byte AES256 key
* - uint8_t nonce: single nonce byte
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) {
    uint8_t iv[12] = {0};
    aes256ctx ctx;

    iv[0] = nonce;

    aes256_ctr_keyexp(&ctx, key);
    aes256_ctr(output, outlen, iv, &ctx);
    aes256_ctx_release(&ctx);
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb
*
* Description: Initialise an AES256-CTR based XOF replacement. The key
* schedule is expanded into the state, the two extra bytes
* become the first two bytes of the 12-byte IV (the other
* ten are zero) and the block counter is reset to 0.
*
* Arguments: - aes256xof_ctx *s: state to initialise
* - const uint8_t *key: 32-byte key to "absorb"
* - uint8_t x: IV byte 0
* - uint8_t y: IV byte 1
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y) {
    aes256_ecb_keyexp(&s->sk_exp, key);

    s->iv[0] = x;
    s->iv[1] = y;
    /* Zero-pad the rest of the nonce. */
    memset(s->iv + 2, 0, sizeof s->iv - 2);

    s->ctr = 0;
}

/*************************************************
* Name: PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks
*
* Description: AES256 CTR used as a replacement for a XOF; this function
* writes nblocks * 64 bytes of AES256-CTR output and advances
* the stored counter by 4 for every 64-byte block produced
*
* Arguments: - uint8_t *out: pointer to output
* - size_t nblocks: number of requested 64-byte output blocks
* - aes256xof_ctx *s: AES "state", i.e. expanded key, IV and counter
**************************************************/
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s) {
/* The counter is passed by value, so the stored copy is updated separately below. */
aes256_ctr_xof(out, nblocks * 64, s->iv, s->ctr, &s->sk_exp);
/* Resume after the blocks just emitted on the next call. */
s->ctr += (uint32_t) (4 * nblocks);
}

void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) {
aes256_ctx_release(&s->sk_exp);
void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce) {
uint8_t expnonce[12] = {0};
expnonce[0] = nonce;
PQCLEAN_KYBER102490S_CLEAN_aes256ctr_prf(out, outlen, key, expnonce);
}

+ 0
- 19
crypto_kem/kyber1024-90s/clean/symmetric-aes.h 查看文件

@@ -1,19 +0,0 @@
#ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H
#define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_AES_H
#include "aes.h"
#include <stddef.h>
#include <stdint.h>


/* State of the AES256-CTR based XOF replacement. */
typedef struct {
/* AES-256 context; presumably holds the expanded key schedule (see aes.h) */
aes256ctx sk_exp;
/* 12-byte nonce portion of the counter block */
uint8_t iv[12];
/* 32-bit block counter */
uint32_t ctr;
} aes256xof_ctx;

void PQCLEAN_KYBER102490S_CLEAN_aes256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce);
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(aes256xof_ctx *s, const uint8_t *key, uint8_t x, uint8_t y);
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblocks, aes256xof_ctx *s);
void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s);

#endif

+ 12
- 7
crypto_kem/kyber1024-90s/clean/symmetric.h 查看文件

@@ -1,23 +1,28 @@
#ifndef PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H
#define PQCLEAN_KYBER102490S_CLEAN_SYMMETRIC_H
#include "aes256ctr.h"
#include "params.h"
#include "sha2.h"
#include "symmetric-aes.h"
#include <stddef.h>
#include <stdint.h>



typedef aes256xof_ctx xof_state;

#define XOF_BLOCKBYTES 64
typedef aes256ctr_ctx xof_state;

void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(aes256ctr_ctx *state, const uint8_t seed[32], uint8_t x, uint8_t y);

void PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[32], uint8_t nonce);

#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES

#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256xof_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_kyber_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES)




+ 3
- 3
crypto_kem/kyber1024/META.yml 查看文件

@@ -6,7 +6,7 @@ length-public-key: 1568
length-ciphertext: 1568
length-secret-key: 3168
length-shared-secret: 32
nistkat-sha256: b4b4fc1c2cbbb182252d2822ccb8cb704bcfe876122635c5dfa48ddc09b6e73f
nistkat-sha256: 5afcf2a568ad32d49b55105b032af1850f03f3888ff9e2a72f4059c58e968f60
principal-submitters:
- Peter Schwabe
auxiliary-submitters:
@@ -21,9 +21,9 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
- name: avx2
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
supported_platforms:
- architecture: x86_64
operating_systems:


+ 9
- 13
crypto_kem/kyber1024/avx2/align.h 查看文件

@@ -2,22 +2,18 @@
#define PQCLEAN_KYBER1024_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

#define ALIGN16_TYPE(t) \
union { \
__m128i vec; \
t orig; \
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[(N)]; \
__m256i vec[((N)+31)/32]; \
}

#define ALIGN32_ARRAY(t, s) \
union { \
__m256i vec; \
t arr[(s)]; \
#define ALIGNED_INT16(N) \
union { \
int16_t coeffs[(N)]; \
__m256i vec[((N)+15)/16]; \
}

#define ALIGN32_ARRAY_2D(t, n, m) \
union { \
__m256i vec; \
t arr[(n)][(m)]; \
}
#endif

+ 96
- 237
crypto_kem/kyber1024/avx2/basemul.S 查看文件

@@ -1,248 +1,107 @@
#include "cdecl.h"
#include "params.h"

.macro schoolbook off,sign
#load
vmovdqa \off+32(%rsi),%ymm7 # b
vmovdqa \off+32(%rdx),%ymm8 # d
vmovdqa \off(%rsi),%ymm9 # a
vmovdqa \off(%rdx),%ymm10 # c

#mul
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi

#reduce
vpmullw %ymm1,%ymm11,%ymm11
vpmulhw %ymm0,%ymm11,%ymm11
vpsubw %ymm11,%ymm12,%ymm11 # bd

#mul
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi

#unpack
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1

#add
.ifeq \sign
vpaddd %ymm14,%ymm15,%ymm14 # x0
vpaddd %ymm9,%ymm10,%ymm9 # x1
.else
vpsubd %ymm15,%ymm14,%ymm14 # x0
vpsubd %ymm10,%ymm9,%ymm9 # x1
.endif
vpaddd %ymm12,%ymm13,%ymm12 # y0
vpaddd %ymm7,%ymm8,%ymm7 # y1
.endm

.macro red a0,a1,b0,b1,x,y,z
#pack
vpxor %ymm\x,%ymm\x,%ymm\x
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z
vpsrld $16,%ymm\a0,%ymm\a0
vpsrld $16,%ymm\a1,%ymm\a1
vpackusdw %ymm\z,%ymm\y,%ymm\z
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x
vpsrld $16,%ymm\b0,%ymm\b0
vpsrld $16,%ymm\b1,%ymm\b1
vpackusdw %ymm\x,%ymm\y,%ymm\y
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0

#reduce
vpmullw %ymm1,%ymm\z,%ymm\z
vpmullw %ymm1,%ymm\y,%ymm\y
vpmulhw %ymm0,%ymm\z,%ymm\z
vpmulhw %ymm0,%ymm\y,%ymm\y
vpsubw %ymm\z,%ymm\a0,%ymm\a0
vpsubw %ymm\y,%ymm\b0,%ymm\b0
.macro schoolbook off
vmovdqa _16XQINV*2(%rcx),%ymm0
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1

vpmullw %ymm0,%ymm1,%ymm9 # a0.lo
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo

vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0

vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi

vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1

vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi

vmovdqa %ymm13,(%rsp)

vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo

vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo

vmovdqa _16XQ*2(%rcx),%ymm8
vpmulhw %ymm8,%ymm13,%ymm13
vpmulhw %ymm8,%ymm9,%ymm9
vpmulhw %ymm8,%ymm5,%ymm5
vpmulhw %ymm8,%ymm10,%ymm10
vpmulhw %ymm8,%ymm6,%ymm6
vpmulhw %ymm8,%ymm11,%ymm11
vpmulhw %ymm8,%ymm7,%ymm7
vpmulhw %ymm8,%ymm12,%ymm12

vpsubw (%rsp),%ymm13,%ymm13 # -a0c0
vpsubw %ymm9,%ymm1,%ymm9 # a0d0
vpsubw %ymm5,%ymm14,%ymm5 # b0c0
vpsubw %ymm10,%ymm2,%ymm10 # b0d0

vpsubw %ymm6,%ymm15,%ymm6 # a1c1
vpsubw %ymm11,%ymm3,%ymm11 # a1d1
vpsubw %ymm7,%ymm0,%ymm7 # b1c1
vpsubw %ymm12,%ymm4,%ymm12 # b1d1

vmovdqa (%r9),%ymm0
vmovdqa 32(%r9),%ymm1
vpmullw %ymm0,%ymm10,%ymm2
vpmullw %ymm0,%ymm12,%ymm3
vpmulhw %ymm1,%ymm10,%ymm10
vpmulhw %ymm1,%ymm12,%ymm12
vpmulhw %ymm8,%ymm2,%ymm2
vpmulhw %ymm8,%ymm3,%ymm3
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1

vpaddw %ymm5,%ymm9,%ymm9
vpaddw %ymm7,%ymm11,%ymm11
vpsubw %ymm13,%ymm10,%ymm13
vpsubw %ymm12,%ymm6,%ymm6

vmovdqa %ymm13,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(64*\off+16)*2(%rdi)
vmovdqa %ymm6,(64*\off+32)*2(%rdi)
vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.endm

.text
basemul64_acc_avx:
poly0.0:
schoolbook 0,0

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.0:
schoolbook 512,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly2.0:
schoolbook 1024,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly3.0:
schoolbook 1536,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm5,32(%rdi)

poly0.1:
schoolbook 64,1

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.1:
schoolbook 576,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly2.1:
schoolbook 1088,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

poly3.1:
schoolbook 1600,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6

#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm5,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx)
.global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx):
_cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

ret

basemul64_avx:
schoolbook 0,0

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,(%rdi)
vmovdqa %ymm12,32(%rdi)

schoolbook 64,1

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,64(%rdi)
vmovdqa %ymm12,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx)
.global _cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx):
_cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx
mov %rsp,%r8
and $-32,%rsp
sub $32,%rsp

lea (_ZETAS_EXP+176)*2(%rcx),%r9
schoolbook 0

add $32*2,%r9
schoolbook 1

add $192*2,%r9
schoolbook 2

add $32*2,%r9
schoolbook 3

mov %r8,%rsp
ret

+ 41
- 43
crypto_kem/kyber1024/avx2/cbd.c 查看文件

@@ -4,66 +4,64 @@
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_cbd
* Name: cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* a centered binomial distribution with parameter eta=2
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *buf: pointer to input byte array
* Arguments: - poly *r: pointer to output polynomial
* - const __m256i *buf: pointer to aligned input byte array
**************************************************/
void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) {
unsigned int i;
__m256i vec0, vec1, vec2, vec3, tmp;
__m256i f0, f1, f2, f3;
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);

for (i = 0; i < KYBER_N / 64; i++) {
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]);
f0 = _mm256_load_si256(&buf[i]);

vec1 = _mm256_srli_epi32(vec0, 1);
vec0 = _mm256_and_si256(mask55, vec0);
vec1 = _mm256_and_si256(mask55, vec1);
vec0 = _mm256_add_epi32(vec0, vec1);
f1 = _mm256_srli_epi16(f0, 1);
f0 = _mm256_and_si256(mask55, f0);
f1 = _mm256_and_si256(mask55, f1);
f0 = _mm256_add_epi8(f0, f1);

vec1 = _mm256_srli_epi32(vec0, 2);
vec0 = _mm256_and_si256(mask33, vec0);
vec1 = _mm256_and_si256(mask33, vec1);
f1 = _mm256_srli_epi16(f0, 2);
f0 = _mm256_and_si256(mask33, f0);
f1 = _mm256_and_si256(mask33, f1);
f0 = _mm256_add_epi8(f0, mask33);
f0 = _mm256_sub_epi8(f0, f1);

vec2 = _mm256_srli_epi32(vec0, 4);
vec3 = _mm256_srli_epi32(vec1, 4);
vec0 = _mm256_and_si256(mask03, vec0);
vec1 = _mm256_and_si256(mask03, vec1);
vec2 = _mm256_and_si256(mask03, vec2);
vec3 = _mm256_and_si256(mask03, vec3);
f1 = _mm256_srli_epi16(f0, 4);
f0 = _mm256_and_si256(mask0F, f0);
f1 = _mm256_and_si256(mask0F, f1);
f0 = _mm256_sub_epi8(f0, mask03);
f1 = _mm256_sub_epi8(f1, mask03);

vec1 = _mm256_sub_epi8(vec0, vec1);
vec3 = _mm256_sub_epi8(vec2, vec3);
f2 = _mm256_unpacklo_epi8(f0, f1);
f3 = _mm256_unpackhi_epi8(f0, f1);

vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1));
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1));
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3));
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1));
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1));
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1));

tmp = _mm256_unpacklo_epi16(vec0, vec2);
vec2 = _mm256_unpackhi_epi16(vec0, vec2);
vec0 = tmp;
tmp = _mm256_unpacklo_epi16(vec1, vec3);
vec3 = _mm256_unpackhi_epi16(vec1, vec3);
vec1 = tmp;
_mm256_store_si256(&r->vec[4 * i + 0], f0);
_mm256_store_si256(&r->vec[4 * i + 1], f2);
_mm256_store_si256(&r->vec[4 * i + 2], f1);
_mm256_store_si256(&r->vec[4 * i + 3], f3);
}
}

tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20);
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31);
vec0 = tmp;
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20);
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31);
vec1 = tmp;

_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3);
}
/*
 * Sample polynomial r from the aligned byte buffer buf for the eta1 noise
 * parameter; here this simply forwards to cbd2.
 *
 * buf is declared one __m256i (32 bytes) longer than cbd2 takes; the
 * original note attributes the extra space to cbd3, which is not used in
 * this function.
 */
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) {
cbd2(r, buf);
}

/*
 * Sample polynomial r from the aligned byte buffer buf for the eta2 noise
 * parameter; here this simply forwards to cbd2.
 */
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) {
cbd2(r, buf);
}

+ 4
- 1
crypto_kem/kyber1024/avx2/cbd.h 查看文件

@@ -2,8 +2,11 @@
#define PQCLEAN_KYBER1024_AVX2_CBD_H
#include "params.h"
#include "poly.h"
#include <immintrin.h>
#include <stdint.h>

void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);
void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]);

void PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]);

#endif

+ 7
- 3
crypto_kem/kyber1024/avx2/cdecl.h 查看文件

@@ -1,6 +1,8 @@
#ifndef PQCLEAN_KYBER1024_AVX2_CDECL_H
#define PQCLEAN_KYBER1024_AVX2_CDECL_H



#define _16XQ 0
#define _16XQINV 16
#define _16XV 32
@@ -9,9 +11,10 @@
#define _16XMONTSQLO 80
#define _16XMONTSQHI 96
#define _16XMASK 112
#define _ZETAS_EXP 128
#define _ZETAS_INV_EXP 528

#define _REVIDXB 128
#define _REVIDXD 144
#define _ZETAS_EXP 160
#define _16XSHIFT 624

/* The C ABI on MacOS exports all symbols with a leading
* underscore. This means that any symbols we refer to from
@@ -23,4 +26,5 @@

#define _cdecl(s) _##s
#define cdecl(s) s

#endif

+ 88
- 120
crypto_kem/kyber1024/avx2/consts.c 查看文件

@@ -1,155 +1,123 @@
#include "align.h"
#include "consts.h"
#include "params.h"
#include <stdint.h>

#define Q KYBER_Q
#define MONT ((1U << 16) % Q)
#define QINV 62209 // q^-1 mod 2^16
#define V (((1U << 26) + Q/2)/Q)
#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q)
#define FLO (FHI*QINV % 65536)
#define MONTSQHI (MONT*MONT % Q)
#define MONTSQLO (MONTSQHI*QINV % 65536)
#define MONT (-1044) // 2^16 mod q
#define QINV (-3327) // q^-1 mod 2^16
#define V 20159 // floor(2^26/q + 0.5)
#define FHI 1441 // mont^2/128
#define FLO (-10079) // qinv*FHI
#define MONTSQHI 1353 // mont^2
#define MONTSQLO 20553 // qinv*MONTSQHI
#define MASK 4095
#define SHIFT 32


const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = {
#define _16XQ 0
const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.coeffs = {
//#define _16XQ 0
Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q,

#define _16XQINV 16
//#define _16XQINV 16
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,
QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV,

#define _16XV 32
//#define _16XV 32
V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V,

#define _16XFLO 48
//#define _16XFLO 48
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,
FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO,

#define _16XFHI 64
//#define _16XFHI 64
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,
FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI,

#define _16XMONTSQLO 80
//#define _16XMONTSQLO 80
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,
MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO,

#define _16XMONTSQHI 96
//#define _16XMONTSQHI 96
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,
MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI,

#define _16XMASK 112
//#define _16XMASK 112
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,
MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK,

#define _ZETAS_EXP 128
31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758,
61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846,
3158, 3158, 3158, 3158, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479,
5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295,
573, 573, 2004, 2004, 264, 264, 383, 383,
2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199,
59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081,
52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837,
1223, 652, 2777, 1015, 2036, 1491, 3047, 1785,
516, 3321, 3009, 2663, 1711, 2167, 126, 1469,
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182,
32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261,
2226, 555, 2078, 1550, 422, 177, 3038, 1574,
3083, 1159, 2552, 2727, 1739, 2457, 418, 3173,
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493,
33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918,
430, 843, 871, 105, 587, 3094, 2869, 1653,
778, 3182, 1483, 1119, 644, 349, 329, 3254,
788, 788, 1812, 1812, 28191, 28191, 28191, 28191,
28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842,
48842, 48842, 48842, 48842, 287, 287, 287, 287,
287, 287, 287, 287, 202, 202, 202, 202,
202, 202, 202, 202, 10690, 10690, 10690, 10690,
1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335,
31164, 31164, 31164, 31164, 962, 962, 962, 962,
2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855,
1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313,
55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859,
26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017,
732, 732, 608, 608, 1787, 1787, 411, 411,
3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638,
37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780,
16832, 4312, 41381, 47622, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604,
448, 2264, 677, 2054, 34353, 25435, 58154, 24392,
44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907,
31637, 28644, 23998, 48114, 817, 603, 1322, 1864,
2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459,
3221, 996, 958, 1522, 20297, 2146, 15356, 33152,
59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094,
41677, 45279, 7757, 23132, 1097, 610, 2044, 384,
3193, 1994, 220, 1670, 1799, 794, 2475, 478,
3021, 991, 1869, 1628, 0, 0, 0, 0,
//#define _REVIDXB 128
3854, 3340, 2826, 2312, 1798, 1284, 770, 256,
3854, 3340, 2826, 2312, 1798, 1284, 770, 256,

//#define _REVIDXD 144
7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0, 0,

//#define _ZETAS_EXP 160
31498, 31498, 31498, 31498, -758, -758, -758, -758,
5237, 5237, 5237, 5237, 1397, 1397, 1397, 1397,
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
-359, -359, -359, -359, -359, -359, -359, -359,
-359, -359, -359, -359, -359, -359, -359, -359,
13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525,
-12402, -12402, -12402, -12402, -12402, -12402, -12402, -12402,
1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422,
-20907, -20907, -20907, -20907, 27758, 27758, 27758, 27758,
-3799, -3799, -3799, -3799, -15690, -15690, -15690, -15690,
-171, -171, -171, -171, 622, 622, 622, 622,
1577, 1577, 1577, 1577, 182, 182, 182, 182,
-5827, -5827, 17363, 17363, -26360, -26360, -29057, -29057,
5571, 5571, -1102, -1102, 21438, 21438, -26242, -26242,
573, 573, -1325, -1325, 264, 264, 383, 383,
-829, -829, 1458, 1458, -1602, -1602, -130, -130,
-5689, -6516, 1496, 30967, -23565, 20179, 20710, 25080,
-12796, 26616, 16064, -12442, 9134, -650, -25986, 27837,
1223, 652, -552, 1015, -1293, 1491, -282, -1544,
516, -8, -320, -666, -1618, -1162, 126, 1469,
-335, -11477, -32227, 20494, -27738, 945, -14883, 6182,
32010, 10631, 29175, -28762, -18486, 17560, -14430, -5276,
-1103, 555, -1251, 1550, 422, 177, -291, 1574,
-246, 1159, -777, -602, -1590, -872, 418, -156,
11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493,
-32502, 30317, -18741, 12639, 20100, 18525, 19529, -12619,
430, 843, 871, 105, 587, -235, -460, 1653,
778, -147, 1483, 1119, 644, 349, 329, -75,
787, 787, 787, 787, 787, 787, 787, 787,
787, 787, 787, 787, 787, 787, 787, 787,
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
-16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694,
287, 287, 287, 287, 287, 287, 287, 287,
202, 202, 202, 202, 202, 202, 202, 202,
10690, 10690, 10690, 10690, 1358, 1358, 1358, 1358,
-11202, -11202, -11202, -11202, 31164, 31164, 31164, 31164,
962, 962, 962, 962, -1202, -1202, -1202, -1202,
-1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468,
-28073, -28073, 24313, 24313, -10532, -10532, 8800, 8800,
18426, 18426, 8859, 8859, 26675, 26675, -16163, -16163,
-681, -681, 1017, 1017, 732, 732, 608, 608,
-1542, -1542, 411, 411, -205, -205, -1571, -1571,
19883, -28250, -15887, -8898, -28309, 9075, -30199, 18249,
13426, 14017, -29156, -12757, 16832, 4311, -24155, -17915,
-853, -90, -271, 830, 107, -1421, -247, -951,
-398, 961, -1508, -725, 448, -1065, 677, -1275,
-31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422,
817, 603, 1322, -1465, -1215, 1218, -874, -1187,
-1185, -1278, -1510, -870, -108, 996, 958, 1522,
20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
-21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,
1097, 610, -1285, 384, -136, -1335, 220, -1659,
-1530, 794, -854, 478, -308, 991, -1460, 1628,

#define _ZETAS_INV_EXP 528
42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498,
51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240,
1701, 1460, 2338, 308, 2851, 854, 2535, 1530,
1659, 3109, 1335, 136, 2945, 1285, 2719, 2232,
17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201,
48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184,
1807, 2371, 2333, 108, 870, 1510, 1278, 1185,
1187, 874, 2111, 1215, 1465, 2007, 2726, 2512,
17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110,
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653,
1275, 2652, 1065, 2881, 725, 1508, 2368, 398,
951, 247, 1421, 3222, 2499, 271, 90, 853,
16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110,
56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073,
1571, 1571, 205, 205, 2918, 2918, 1542, 1542,
2721, 2721, 2597, 2597, 2312, 2312, 681, 681,
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202,
64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847,
1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474,
1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367,
16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695,
37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346,
3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127,
3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042,
64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437,
52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406,
21656, 14234, 52150, 54355, 75, 3000, 2980, 2685,
2210, 1846, 147, 2551, 1676, 460, 235, 2742,
3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486,
28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739,
45043, 32227, 11478, 335, 156, 2911, 872, 1590,
602, 777, 2170, 246, 1755, 291, 3152, 2907,
1779, 1251, 2774, 1103, 37700, 25987, 650, 56402,
12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565,
34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618,
666, 320, 8, 2813, 1544, 282, 1838, 1293,
2314, 552, 2677, 2106, 26242, 26242, 44098, 44098,
1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361,
48173, 48173, 5828, 5828, 130, 130, 1602, 1602,
1871, 1871, 829, 829, 2946, 2946, 3065, 3065,
1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691,
3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779,
20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147,
1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707,
171, 171, 171, 171, 12403, 12403, 12403, 12403,
12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012,
52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907,
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836,
1836, 1836, 1836, 1836, 50791, 50791, 359, 359,
60300, 60300, 1932, 1932, 0, 0, 0, 0
//#define _16XSHIFT 624
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT,
SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT, SHIFT
}
};

+ 2
- 11
crypto_kem/kyber1024/avx2/consts.h 查看文件

@@ -1,19 +1,10 @@
#ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H
#define PQCLEAN_KYBER1024_AVX2_CONSTS_H
#include "align.h"
#include "cdecl.h"
#include "params.h"
#include <immintrin.h>
#include <stdint.h>


#define ALIGNED_UINT16_T(N) \
union { \
__m256i as_vec; \
uint16_t as_arr[(N)]; \
}

typedef ALIGNED_UINT16_T(928) qdata_t;

typedef ALIGNED_INT16(640) qdata_t;
extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata;

#endif

+ 30
- 46
crypto_kem/kyber1024/avx2/fips202x4.c 查看文件

@@ -9,22 +9,14 @@
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds
extern void KeccakF1600_StatePermute4x(__m256i *s);

/* Serialise u into x[0..7] in little-endian byte order (x[0] = lowest byte). */
static inline void store64(uint8_t x[8], uint64_t u) {
    uint8_t *dst = x;
    uint8_t *const end = x + 8;

    while (dst != end) {
        *dst++ = (uint8_t)u;   /* emit the current low byte */
        u >>= 8;               /* bring the next byte down */
    }
}

static void keccakx4_absorb(__m256i s[25],
unsigned int r,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen,
uint8_t p) {
static void keccakx4_absorb_once(__m256i s[25],
unsigned int r,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen,
uint8_t p) {
size_t i, pos = 0;
__m256i t, idx;

@@ -39,20 +31,17 @@ static void keccakx4_absorb(__m256i s[25],
s[i] = _mm256_xor_si256(s[i], t);
pos += 8;
}
inlen -= r;

KeccakF1600_StatePermute4x(s);
inlen -= r;
}

i = 0;
while (inlen >= 8) {
for (i = 0; i < inlen / 8; ++i) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
s[i] = _mm256_xor_si256(s[i], t);

i++;
pos += 8;
inlen -= 8;
}
inlen -= 8 * i;

if (inlen) {
t = _mm256_i64gather_epi64((long long *)pos, idx, 1);
@@ -75,37 +64,34 @@ static void keccakx4_squeezeblocks(uint8_t *out0,
unsigned int r,
__m256i s[25]) {
unsigned int i;
uint64_t f0, f1, f2, f3;
__m128d t;

while (nblocks > 0) {
KeccakF1600_StatePermute4x(s);
for (i = 0; i < r / 8; ++i) {
f0 = _mm256_extract_epi64(s[i], 0);
f1 = _mm256_extract_epi64(s[i], 1);
f2 = _mm256_extract_epi64(s[i], 2);
f3 = _mm256_extract_epi64(s[i], 3);
store64(out0, f0);
store64(out1, f1);
store64(out2, f2);
store64(out3, f3);

out0 += 8;
out1 += 8;
out2 += 8;
out3 += 8;
t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i]));
_mm_storel_pd((double *)&out0[8 * i], t);
_mm_storeh_pd((double *)&out1[8 * i], t);
t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1));
_mm_storel_pd((double *)&out2[8 * i], t);
_mm_storeh_pd((double *)&out3[8 * i], t);
}

out0 += r;
out1 += r;
out2 += r;
out3 += r;
--nblocks;
}
}

void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state,
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
@@ -114,17 +100,16 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE,
state->s);
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s);
}

void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state,
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3,
size_t inlen) {
keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F);
}

void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
@@ -133,8 +118,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0,
uint8_t *out3,
size_t nblocks,
keccakx4_state *state) {
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE,
state->s);
keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s);
}

void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0,
@@ -152,7 +136,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0,
uint8_t t[4][SHAKE128_RATE];
keccakx4_state state;

PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen);
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);

out0 += nblocks * SHAKE128_RATE;
@@ -187,7 +171,7 @@ void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0,
uint8_t t[4][SHAKE256_RATE];
keccakx4_state state;

PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen);
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen);
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state);

out0 += nblocks * SHAKE256_RATE;


+ 2
- 2
crypto_kem/kyber1024/avx2/fips202x4.h 查看文件

@@ -9,7 +9,7 @@ typedef struct {
__m256i s[25];
} keccakx4_state;

void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state,
void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
@@ -23,7 +23,7 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0,
size_t nblocks,
keccakx4_state *state);

void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state,
void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(keccakx4_state *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,


+ 8
- 51
crypto_kem/kyber1024/avx2/fq.S 查看文件

@@ -13,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7
vmovdqa 192(%rdi),%ymm8
vmovdqa 224(%rdi),%ymm9

red16 2,10
red16 3,11
red16 4,12
red16 5,13
red16 6,14
red16 7,15
red16 8,10
red16 9,11
red16 2
red16 3
red16 4
red16 5
red16 6
red16 7
red16 8
red16 9

#store
vmovdqa %ymm2,(%rdi)
@@ -46,49 +46,6 @@ add $256,%rdi
call reduce128_avx
ret

csubq128_avx:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm2
vmovdqa 64(%rdi),%ymm3
vmovdqa 96(%rdi),%ymm4
vmovdqa 128(%rdi),%ymm5
vmovdqa 160(%rdi),%ymm6
vmovdqa 192(%rdi),%ymm7
vmovdqa 224(%rdi),%ymm8

csubq 1,9
csubq 2,10
csubq 3,11
csubq 4,12
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,9

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm2,32(%rdi)
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm5,128(%rdi)
vmovdqa %ymm6,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm8,224(%rdi)

ret

.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx)
.global _cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx):
_cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
call csubq128_avx
add $256,%rdi
call csubq128_avx
ret

tomont128_avx:
#load
vmovdqa (%rdi),%ymm3


+ 5
- 4
crypto_kem/kyber1024/avx2/fq.inc 查看文件

@@ -1,6 +1,10 @@
.macro red16 r,x=12
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
@@ -10,9 +14,6 @@ vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
#vpcmpgtw %ymm0,%ymm\r,%ymm\x
#vpand %ymm0,%ymm\x,%ymm\x
#vpsubw %ymm\x,%ymm\r,%ymm\r
.endm

.macro caddq r,x=12


+ 112
- 120
crypto_kem/kyber1024/avx2/indcpa.c 查看文件

@@ -8,6 +8,7 @@
#include "randombytes.h"
#include "rejsample.h"
#include "symmetric.h"
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

@@ -15,11 +16,14 @@
* Name: pack_pk
*
* Description: Serialize the public key as concatenation of the
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
* serialized vector of polynomials pk and the
* public seed used to generate the matrix A.
* The polynomial coefficients in pk are assumed to
* lie in the interval [0,q], i.e. pk must be reduced
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce().
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
@@ -55,9 +59,12 @@ static void unpack_pk(polyvec *pk,
/*************************************************
* Name: pack_sk
*
* Description: Serialize the secret key
* Description: Serialize the secret key.
* The polynomial coefficients in sk are assumed to
* lie in the interval [0,q], i.e. sk must be reduced
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce().
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
@@ -67,15 +74,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
/*************************************************
* Name: unpack_sk
*
* Description: De-serialize the secret key;
* inverse of pack_sk
* Description: De-serialize the secret key; inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of polynomials
* (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, packedsk);
}

@@ -84,15 +88,16 @@ static void unpack_sk(polyvec *sk,
*
* Description: Serialize the ciphertext as concatenation of the
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
* and the compressed and serialized polynomial v.
* The polynomial coefficients in b and v are assumed to
* lie in the interval [0,q], i.e. b and v must be reduced
* by PQCLEAN_KYBER1024_AVX2_polyvec_reduce() and PQCLEAN_KYBER1024_AVX2_poly_reduce(), respectively.
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
* polyvec *b: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) {
PQCLEAN_KYBER1024_AVX2_polyvec_compress(r, b);
PQCLEAN_KYBER1024_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -103,13 +108,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, c);
PQCLEAN_KYBER1024_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -120,11 +123,9 @@ static void unpack_ciphertext(polyvec *b,
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* Arguments: - int16_t *r: pointer to output array
* - unsigned int len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
@@ -134,16 +135,19 @@ static unsigned int rej_uniform(int16_t *r,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < len && pos + 3 <= buflen) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
pos += 3;

if (val < 19 * KYBER_Q) {
val -= ((uint32_t)val * 20159 >> 26) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (ctr < len && val1 < KYBER_Q) {
r[ctr++] = val1;
}
}

@@ -165,61 +169,54 @@ static unsigned int rej_uniform(int16_t *r,
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) {
unsigned int i, ctr0, ctr1, ctr2, ctr3;
ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf;
ALIGNED_UINT8(REJ_UNIFORM_AVX_NBLOCKS * SHAKE128_RATE) buf[4];
__m256i f;
keccakx4_state state;

for (i = 0; i < 4; i++) {
f = _mm256_load_si256((__m256i *)seed);
_mm256_store_si256((__m256i *)buf.arr[0], f);
_mm256_store_si256((__m256i *)buf.arr[1], f);
_mm256_store_si256((__m256i *)buf.arr[2], f);
_mm256_store_si256((__m256i *)buf.arr[3], f);
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
_mm256_store_si256(buf[1].vec, f);
_mm256_store_si256(buf[2].vec, f);
_mm256_store_si256(buf[3].vec, f);

if (transposed) {
buf.arr[0][KYBER_SYMBYTES + 0] = i;
buf.arr[0][KYBER_SYMBYTES + 1] = 0;
buf.arr[1][KYBER_SYMBYTES + 0] = i;
buf.arr[1][KYBER_SYMBYTES + 1] = 1;
buf.arr[2][KYBER_SYMBYTES + 0] = i;
buf.arr[2][KYBER_SYMBYTES + 1] = 2;
buf.arr[3][KYBER_SYMBYTES + 0] = i;
buf.arr[3][KYBER_SYMBYTES + 1] = 3;
buf[0].coeffs[32] = i;
buf[0].coeffs[33] = 0;
buf[1].coeffs[32] = i;
buf[1].coeffs[33] = 1;
buf[2].coeffs[32] = i;
buf[2].coeffs[33] = 2;
buf[3].coeffs[32] = i;
buf[3].coeffs[33] = 3;
} else {
buf.arr[0][KYBER_SYMBYTES + 0] = 0;
buf.arr[0][KYBER_SYMBYTES + 1] = i;
buf.arr[1][KYBER_SYMBYTES + 0] = 1;
buf.arr[1][KYBER_SYMBYTES + 1] = i;
buf.arr[2][KYBER_SYMBYTES + 0] = 2;
buf.arr[2][KYBER_SYMBYTES + 1] = i;
buf.arr[3][KYBER_SYMBYTES + 0] = 3;
buf.arr[3][KYBER_SYMBYTES + 1] = i;
buf[0].coeffs[32] = 0;
buf[0].coeffs[33] = i;
buf[1].coeffs[32] = 1;
buf[1].coeffs[33] = i;
buf[2].coeffs[32] = 2;
buf[2].coeffs[33] = i;
buf[3].coeffs[32] = 3;
buf[3].coeffs[33] = i;
}

PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3],
GEN_MATRIX_NBLOCKS, &state);
PQCLEAN_KYBER1024_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 34);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_AVX_NBLOCKS, &state);

ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]);
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]);
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]);
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]);
ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf[0].coeffs);
ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf[1].coeffs);
ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf[2].coeffs);
ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf[3].coeffs);

while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) {
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state);

ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0],
XOF_BLOCKBYTES);
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1],
XOF_BLOCKBYTES);
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2],
XOF_BLOCKBYTES);
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3],
XOF_BLOCKBYTES);
PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state);

ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf[0].coeffs, SHAKE128_RATE);
ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf[1].coeffs, SHAKE128_RATE);
ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf[2].coeffs, SHAKE128_RATE);
ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf[3].coeffs, SHAKE128_RATE);
}

PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]);
@@ -243,27 +240,26 @@ void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int t
void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
unsigned int i;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
const uint8_t *publicseed = buf.arr;
const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES;
uint8_t buf[2 * KYBER_SYMBYTES];
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + KYBER_SYMBYTES;
polyvec a[KYBER_K], e, pkpv, skpv;

randombytes(buf.arr, KYBER_SYMBYTES);
hash_g(buf.arr, buf.arr, KYBER_SYMBYTES);
randombytes(buf, KYBER_SYMBYTES);
hash_g(buf, buf, KYBER_SYMBYTES);

gen_a(a, publicseed);

PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed,
0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed,
4, 5, 6, 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, 0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, 4, 5, 6, 7);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv);
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&skpv);
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]);
}

@@ -280,55 +276,51 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTE
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins used as seed
* (of length KYBER_SYMBYTES) to deterministically
* generate all randomness
**************************************************/
void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[KYBER_SYMBYTES]) {
unsigned int i;
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
uint8_t seed[KYBER_SYMBYTES];
polyvec sp, pkpv, ep, at[KYBER_K], b;
poly v, k, epp;

unpack_pk(&pkpv, seed.arr, pk);
unpack_pk(&pkpv, seed, pk);
PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m);
gen_at(at, seed.arr);
gen_at(at, seed);

PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins,
0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins,
4, 5, 6, 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8);
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, 0, 1, 2, 3);
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, 4, 5, 6, 7);
PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(&epp, coins, 8);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
}
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&b);
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v);

PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER1024_AVX2_polyvec_add(&b, &b, &ep);
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp);
PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &k);
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&bp);
PQCLEAN_KYBER1024_AVX2_polyvec_reduce(&b);
PQCLEAN_KYBER1024_AVX2_poly_reduce(&v);

pack_ciphertext(c, &bp, &v);
pack_ciphertext(c, &b, &v);
}

/*************************************************
@@ -337,24 +329,24 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
polyvec b, skpv;
poly v, mp;

unpack_ciphertext(&bp, &v, c);
unpack_ciphertext(&b, &v, c);
unpack_sk(&skpv, sk);

PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp);
PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&b);
PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp);

PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp);


+ 122
- 154
crypto_kem/kyber1024/avx2/invntt.S 查看文件

@@ -2,22 +2,21 @@
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2
#update & mul
vpsubw %ymm\rh0,%ymm\rl0,%ymm12
vpsubw %ymm\rh1,%ymm\rl1,%ymm13
vpsubw %ymm\rh2,%ymm\rl2,%ymm14

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
vpsubw %ymm\rl0,%ymm\rh0,%ymm12
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl1,%ymm\rh1,%ymm13

vpmullw %ymm\zl0,%ymm12,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw %ymm\rl2,%ymm\rh2,%ymm14

vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpmullw %ymm\zl0,%ymm13,%ymm\rh1
vpsubw %ymm\rh3,%ymm\rl3,%ymm15
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw %ymm\rl3,%ymm\rh3,%ymm15

vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm14,%ymm\rh2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw %ymm\zl1,%ymm15,%ymm\rh3

vpmulhw %ymm\zh0,%ymm12,%ymm12
@@ -26,60 +25,84 @@ vpmulhw %ymm\zh0,%ymm13,%ymm13
vpmulhw %ymm\zh1,%ymm14,%ymm14
vpmulhw %ymm\zh1,%ymm15,%ymm15

#reduce
vpmulhw %ymm0,%ymm\rh0,%ymm\rh0

vpmulhw %ymm0,%ymm\rh1,%ymm\rh1

vpmulhw %ymm0,%ymm\rh2,%ymm\rh2
vpmulhw %ymm0,%ymm\rh3,%ymm\rh3

#

#

vpsubw %ymm\rh0,%ymm12,%ymm\rh0

vpsubw %ymm\rh1,%ymm13,%ymm\rh1

vpsubw %ymm\rh2,%ymm14,%ymm\rh2
vpsubw %ymm\rh3,%ymm15,%ymm\rh3
.endm

.text
invntt_levels0t5_avx:
level0:
#zetas
vmovdqu (%rsi),%ymm15
vmovdqu 64(%rsi),%ymm3
vmovdqu 32(%rsi),%ymm1
vmovdqu 96(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly 4,5,8,9,6,7,10,11,15,3,1,2

level1:
#zetas
vmovdqu 128(%rsi),%ymm3
vmovdqu 160(%rsi),%ymm2

butterfly 4,5,6,7,8,9,10,11,3,3,2,2
.macro intt_levels0t5 off
/* level 0 */
vmovdqa _16XFLO*2(%rsi),%ymm2
vmovdqa _16XFHI*2(%rsi),%ymm3

vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7

fqmulprecomp 2,3,4
fqmulprecomp 2,3,6
fqmulprecomp 2,3,5
fqmulprecomp 2,3,7

vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+112)*2(%rdi),%ymm11

fqmulprecomp 2,3,8
fqmulprecomp 2,3,10
fqmulprecomp 2,3,9
fqmulprecomp 2,3,11

vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm12
vpshufb %ymm12,%ymm15,%ymm15
vpshufb %ymm12,%ymm1,%ymm1
vpshufb %ymm12,%ymm2,%ymm2
vpshufb %ymm12,%ymm3,%ymm3

butterfly 4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa _REVIDXB*2(%rsi),%ymm1
vpshufb %ymm1,%ymm2,%ymm2
vpshufb %ymm1,%ymm3,%ymm3

butterfly 4,5,6,7,8,9,10,11,2,2,3,3

shuffle1 4,5,3,5
shuffle1 6,7,4,7
shuffle1 8,9,6,9
shuffle1 10,11,8,11

level2:
#zetas
vmovdqu 192(%rsi),%ymm10
vmovdqu 224(%rsi),%ymm2

#consts
vmovdqa _16XV*2(%rdx),%ymm1
/* level 2 */
vmovdqa _REVIDXD*2(%rsi),%ymm12
vpermd (_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd (_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

butterfly 3,4,6,8,5,7,9,11,10,10,2,2
butterfly 3,4,6,8,5,7,9,11,2,2,10,10

vmovdqa _16XV*2(%rsi),%ymm1
red16 3

shuffle2 3,4,10,4
@@ -87,26 +110,22 @@ shuffle2 6,8,3,8
shuffle2 5,7,6,7
shuffle2 9,11,5,11

level3:
#zetas
vmovdqu 256(%rsi),%ymm9
vmovdqu 288(%rsi),%ymm2
/* level 3 */
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq $0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

butterfly 10,3,6,5,4,8,7,11,9,9,2,2

red16 10
butterfly 10,3,6,5,4,8,7,11,2,2,9,9

shuffle4 10,3,9,3
shuffle4 6,5,10,5
shuffle4 4,8,6,8
shuffle4 7,11,4,11

level4:
#zetas
vmovdqu 320(%rsi),%ymm7
vmovdqu 352(%rsi),%ymm2
/* level 4 */
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq $0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

butterfly 9,10,6,4,3,5,8,11,7,7,2,2
butterfly 9,10,6,4,3,5,8,11,2,2,7,7

red16 9

@@ -115,113 +134,62 @@ shuffle8 6,4,9,4
shuffle8 3,5,6,5
shuffle8 8,11,3,11

level5:
#zetas
vpbroadcastd 384(%rsi),%ymm8
vpbroadcastd 388(%rsi),%ymm2

butterfly 7,9,6,3,10,4,5,11,8,8,2,2

red16 7

#store
vmovdqa %ymm7,(%rdi)
vmovdqa %ymm9,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm3,96(%rdi)
vmovdqa %ymm10,128(%rdi)
vmovdqa %ymm4,160(%rdi)
vmovdqa %ymm5,192(%rdi)
vmovdqa %ymm11,224(%rdi)
/* level 5 */
vmovdqa (_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa (_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

ret
butterfly 7,9,6,3,10,4,5,11,2,2,8,8

invntt_level6_avx:
#zetas
vpbroadcastd (%rsi),%ymm1
vpbroadcastd 4(%rsi),%ymm2

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11
vmovdqa %ymm7,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm

butterfly 4,5,6,7,8,9,10,11
.macro intt_level6 off
/* level 6 */
vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm2

#consts
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)

fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)

#load
vmovdqa 128(%rdi),%ymm4
vmovdqa 160(%rdi),%ymm5
vmovdqa 192(%rdi),%ymm6
vmovdqa 224(%rdi),%ymm7
vmovdqa 384(%rdi),%ymm8
vmovdqa 416(%rdi),%ymm9
vmovdqa 448(%rdi),%ymm10
vmovdqa 480(%rdi),%ymm11
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm3

butterfly 4,5,6,7,8,9,10,11

#consts
vmovdqa _16XFLO*2(%rdx),%ymm12
vmovdqa _16XFHI*2(%rdx),%ymm13

#store
vmovdqa %ymm8,384(%rdi)
vmovdqa %ymm9,416(%rdi)
vmovdqa %ymm10,448(%rdi)
vmovdqa %ymm11,480(%rdi)

fqmulprecomp 12,13,4,8
fqmulprecomp 12,13,5,9
fqmulprecomp 12,13,6,10
fqmulprecomp 12,13,7,11

#store
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm5,160(%rdi)
vmovdqa %ymm6,192(%rdi)
vmovdqa %ymm7,224(%rdi)

ret
.if \off == 0
red16 4
.endif

vmovdqa %ymm4,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm

.text
.global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx):
_cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
mov %rsi,%rdx
add $_ZETAS_INV_EXP*2,%rsi
call invntt_levels0t5_avx
add $256,%rdi
add $392,%rsi
call invntt_levels0t5_avx
sub $256,%rdi
add $392,%rsi
call invntt_level6_avx

intt_levels0t5 0
intt_levels0t5 1

intt_level6 0
intt_level6 1
ret

+ 36
- 36
crypto_kem/kyber1024/avx2/kem.c 查看文件

@@ -1,4 +1,3 @@
#include "align.h"
#include "indcpa.h"
#include "kem.h"
#include "params.h"
@@ -15,13 +14,14 @@
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES],
unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -40,36 +40,36 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *
* secret for given public key
*
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES],
unsigned char ss[KYBER_SSBYTES],
const unsigned char pk[KYBER_PUBLICKEYBYTES]) {
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t kr[2 * KYBER_SYMBYTES];

randombytes(buf.arr, KYBER_SYMBYTES);
randombytes(buf, KYBER_SYMBYTES);
/* Don't release system RNG output */
hash_h(buf.arr, buf.arr, KYBER_SYMBYTES);
hash_h(buf, buf, KYBER_SYMBYTES);

/* Multitarget countermeasure for coins + contributory KEM */
hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);
hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES);
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES);
PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES);

/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

@@ -80,47 +80,47 @@ int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct,
* cipher text and private key
*
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES],
const unsigned char ct[KYBER_CIPHERTEXTBYTES],
const unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
int fail;
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf;
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr;
uint8_t cmp[KYBER_CIPHERTEXTBYTES];
uint8_t kr[2 * KYBER_SYMBYTES];
ALIGNED_UINT8(KYBER_CIPHERTEXTBYTES) cmp;
const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;

PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk);
PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk);

/* Multitarget countermeasure for coins + contributory KEM */
for (i = 0; i < KYBER_SYMBYTES; i++) {
buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i];
}
hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES);
hash_g(kr, buf, 2 * KYBER_SYMBYTES);

/* coins are in kr+KYBER_SYMBYTES */
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES);
PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp.coeffs, buf, pk, kr + KYBER_SYMBYTES);

fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES);
fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp.coeffs, KYBER_CIPHERTEXTBYTES);

/* overwrite coins in kr with H(c) */
hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);
hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES);

/* Overwrite pre-k with z on re-encryption failure */
PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, (uint8_t)fail);
PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail);

/* hash concatenation of pre-k and H(c) to k */
kdf(ss, kr.arr, 2 * KYBER_SYMBYTES);
kdf(ss, kr, 2 * KYBER_SYMBYTES);
return 0;
}

+ 154
- 185
crypto_kem/kyber1024/avx2/ntt.S 查看文件

@@ -1,222 +1,191 @@
#include "cdecl.h"
.include "shuffle.inc"
.include "fq.inc"

.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1
#mul
.macro mul rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=2,zh1=2
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmullw %ymm\zl0,%ymm\rh1,%ymm13

vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmullw %ymm\zl1,%ymm\rh3,%ymm15

vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1

vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3
.endm

#reduce
.macro reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13

vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
vpsubw %ymm12,%ymm\rh0,%ymm12
vpsubw %ymm13,%ymm\rh1,%ymm13
vpsubw %ymm14,%ymm\rh2,%ymm14
vpsubw %ymm15,%ymm\rh3,%ymm15

#update
vpsubw %ymm12,%ymm\rl0,%ymm\rh0
vpaddw %ymm12,%ymm\rl0,%ymm\rl0
vpsubw %ymm13,%ymm\rl1,%ymm\rh1
vpaddw %ymm13,%ymm\rl1,%ymm\rl1
vpsubw %ymm14,%ymm\rl2,%ymm\rh2
vpaddw %ymm14,%ymm\rl2,%ymm\rl2
vpsubw %ymm15,%ymm\rl3,%ymm\rh3
vpaddw %ymm15,%ymm\rl3,%ymm\rl3
.endm

# We break the dependency chains at the cost of slightly more additions.
# But they can be run in parallel to the multiplications on execution port 5
# (multiplications only go to ports 0 and 1)
.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1
#mul
vpmullw %ymm\zl0,%ymm\rh0,%ymm12
vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x
vpmullw %ymm\zl0,%ymm\rh1,%ymm13
vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0
vpmullw %ymm\zl1,%ymm\rh2,%ymm14
vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y
vpmullw %ymm\zl1,%ymm\rh3,%ymm15
vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2
.macro update rln,rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3
vpaddw %ymm\rh0,%ymm\rl0,%ymm\rln
vpsubw %ymm\rh0,%ymm\rl0,%ymm\rh0
vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl0

#reduce
vpmulhw %ymm0,%ymm12,%ymm12
vpmulhw %ymm0,%ymm13,%ymm13
vpmulhw %ymm0,%ymm14,%ymm14
vpmulhw %ymm0,%ymm15,%ymm15
vpsubw %ymm\rh1,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl1
vpsubw %ymm\rh2,%ymm\rl2,%ymm\rh2

vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1
vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1
vpsubw %ymm\x,%ymm\rl0,%ymm\rh0
vpaddw %ymm\x,%ymm\rl0,%ymm\rl0
vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3
vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3
vpsubw %ymm\y,%ymm\rl2,%ymm\rh2
vpaddw %ymm\y,%ymm\rl2,%ymm\rl2
vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl2
vpsubw %ymm\rh3,%ymm\rl3,%ymm\rh3

#update
vpsubw %ymm12,%ymm\rln,%ymm\rln
vpaddw %ymm12,%ymm\rh0,%ymm\rh0
vpsubw %ymm12,%ymm\rl0,%ymm\rl0
vpsubw %ymm13,%ymm\rl0,%ymm\rl0

vpaddw %ymm13,%ymm\rh1,%ymm\rh1
vpsubw %ymm13,%ymm\rl1,%ymm\rl1
vpsubw %ymm14,%ymm\rl1,%ymm\rl1
vpaddw %ymm14,%ymm\rh2,%ymm\rh2
vpsubw %ymm14,%ymm\rl2,%ymm\rl2

vpsubw %ymm15,%ymm\rl2,%ymm\rl2
vpaddw %ymm15,%ymm\rh3,%ymm\rh3
vpsubw %ymm15,%ymm\rl3,%ymm\rl3
.endm

.text
ntt_level0_avx:
level0:
#zetas
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 256(%rdi),%ymm8
vmovdqa 288(%rdi),%ymm9
vmovdqa 320(%rdi),%ymm10
vmovdqa 352(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm7,96(%rdi)
vmovdqa %ymm8,256(%rdi)
vmovdqa %ymm9,288(%rdi)
vmovdqa %ymm10,320(%rdi)
vmovdqa %ymm11,352(%rdi)
.macro level0 off
vpbroadcastq (_ZETAS_EXP+0)*2(%rsi),%ymm15
vmovdqa (64*\off+128)*2(%rdi),%ymm8
vmovdqa (64*\off+144)*2(%rdi),%ymm9
vmovdqa (64*\off+160)*2(%rdi),%ymm10
vmovdqa (64*\off+176)*2(%rdi),%ymm11
vpbroadcastq (_ZETAS_EXP+4)*2(%rsi),%ymm2

mul 8,9,10,11

vmovdqa (64*\off+ 0)*2(%rdi),%ymm4
vmovdqa (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa (64*\off+ 48)*2(%rdi),%ymm7

reduce
update 3,4,5,6,7,8,9,10,11

vmovdqa %ymm3,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(64*\off+ 16)*2(%rdi)
vmovdqa %ymm5,(64*\off+ 32)*2(%rdi)
vmovdqa %ymm6,(64*\off+ 48)*2(%rdi)
vmovdqa %ymm8,(64*\off+128)*2(%rdi)
vmovdqa %ymm9,(64*\off+144)*2(%rdi)
vmovdqa %ymm10,(64*\off+160)*2(%rdi)
vmovdqa %ymm11,(64*\off+176)*2(%rdi)
.endm

ret
.macro levels1t6 off
/* level 1 */
vmovdqa (_ZETAS_EXP+224*\off+16)*2(%rsi),%ymm15
vmovdqa (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa (128*\off+112)*2(%rdi),%ymm11
vmovdqa (_ZETAS_EXP+224*\off+32)*2(%rsi),%ymm2

ntt_levels1t6_avx:
level1:
#zetas
vpbroadcastd (%rsi),%ymm15
vpbroadcastd 4(%rsi),%ymm1

#load
vmovdqa (%rdi),%ymm4
vmovdqa 32(%rdi),%ymm5
vmovdqa 64(%rdi),%ymm6
vmovdqa 96(%rdi),%ymm7
vmovdqa 128(%rdi),%ymm8
vmovdqa 160(%rdi),%ymm9
vmovdqa 192(%rdi),%ymm10
vmovdqa 224(%rdi),%ymm11

butterfly2 4,5,6,7,8,9,10,11,3

level2:
#zetas
vmovdqu 8(%rsi),%ymm15
vmovdqu 40(%rsi),%ymm1

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly2 3,8,4,9,5,10,6,11,7

level3:
#zetas
vmovdqu 72(%rsi),%ymm15
vmovdqu 104(%rsi),%ymm1

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly2 7,5,3,10,8,6,4,11,9

level4:
#zetas
vmovdqu 136(%rsi),%ymm15
vmovdqu 168(%rsi),%ymm1

shuffle2 7,8,9,8
shuffle2 5,6,7,6
shuffle2 3,4,5,4
shuffle2 10,11,3,11

butterfly2 9,8,7,6,5,4,3,11,10

level5:
#zetas
vmovdqu 200(%rsi),%ymm15
vmovdqu 232(%rsi),%ymm1

shuffle1 9,5,10,5
shuffle1 8,4,9,4
shuffle1 7,3,8,3
shuffle1 6,11,7,11

butterfly2 10,5,9,4,8,3,7,11,6

level6:
#zetas
vmovdqu 264(%rsi),%ymm14
vmovdqu 328(%rsi),%ymm15
vmovdqu 296(%rsi),%ymm1
vmovdqu 360(%rsi),%ymm2

butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2

vmovdqa _16XV*2(%rdx),%ymm1
red16 10,12
red16 5,13
red16 9,14
red16 4,15
red16 8,2
red16 3,6
red16 7,12
red16 11,13

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm9,64(%rdi)
vmovdqa %ymm4,96(%rdi)
vmovdqa %ymm8,128(%rdi)
vmovdqa %ymm3,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)
mul 8,9,10,11

ret
vmovdqa (128*\off+ 0)*2(%rdi),%ymm4
vmovdqa (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa (128*\off+ 48)*2(%rdi),%ymm7

reduce
update 3,4,5,6,7,8,9,10,11

/* level 2 */
shuffle8 5,10,7,10
shuffle8 6,11,5,11

vmovdqa (_ZETAS_EXP+224*\off+48)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+64)*2(%rsi),%ymm2

mul 7,10,5,11

shuffle8 3,8,6,8
shuffle8 4,9,3,9

reduce
update 4,6,8,3,9,7,10,5,11

/* level 3 */
shuffle4 8,5,9,5
shuffle4 3,11,8,11

vmovdqa (_ZETAS_EXP+224*\off+80)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+96)*2(%rsi),%ymm2

mul 9,5,8,11

shuffle4 4,7,3,7
shuffle4 6,10,4,10

reduce
update 6,3,7,4,10,9,5,8,11

/* level 4 */
shuffle2 7,8,10,8
shuffle2 4,11,7,11

vmovdqa (_ZETAS_EXP+224*\off+112)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+128)*2(%rsi),%ymm2

mul 10,8,7,11

shuffle2 6,9,4,9
shuffle2 3,5,6,5

reduce
update 3,4,9,6,5,10,8,7,11

/* level 5 */
shuffle1 9,7,5,7
shuffle1 6,11,9,11

vmovdqa (_ZETAS_EXP+224*\off+144)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+160)*2(%rsi),%ymm2

mul 5,7,9,11

shuffle1 3,10,6,10
shuffle1 4,8,3,8

reduce
update 4,6,10,3,8,5,7,9,11

/* level 6 */
vmovdqa (_ZETAS_EXP+224*\off+176)*2(%rsi),%ymm14
vmovdqa (_ZETAS_EXP+224*\off+208)*2(%rsi),%ymm15
vmovdqa (_ZETAS_EXP+224*\off+192)*2(%rsi),%ymm8
vmovdqa (_ZETAS_EXP+224*\off+224)*2(%rsi),%ymm2

mul 10,3,9,11,14,15,8,2

reduce
update 8,4,6,5,7,10,3,9,11

vmovdqa %ymm8,(128*\off+ 0)*2(%rdi)
vmovdqa %ymm4,(128*\off+ 16)*2(%rdi)
vmovdqa %ymm10,(128*\off+ 32)*2(%rdi)
vmovdqa %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa %ymm6,(128*\off+ 64)*2(%rdi)
vmovdqa %ymm5,(128*\off+ 80)*2(%rdi)
vmovdqa %ymm9,(128*\off+ 96)*2(%rdi)
vmovdqa %ymm11,(128*\off+112)*2(%rdi)
.endm

.text
.global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx)
.global _cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx)
cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx):
_cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx):
#consts
vmovdqa _16XQ*2(%rsi),%ymm0
mov %rsi,%rdx
add $_ZETAS_EXP*2,%rsi
call ntt_level0_avx
add $128,%rdi
call ntt_level0_avx
sub $128,%rdi
add $8,%rsi
call ntt_levels1t6_avx
add $256,%rdi
add $392,%rsi
call ntt_levels1t6_avx

level0 0
level0 1

levels1t6 0
levels1t6 1

ret

+ 12
- 15
crypto_kem/kyber1024/avx2/ntt.h 查看文件

@@ -1,24 +1,21 @@
#ifndef PQCLEAN_KYBER1024_AVX2_NTT_H
#define PQCLEAN_KYBER1024_AVX2_NTT_H
#include "consts.h"

#include <immintrin.h>
#include <stdint.h>

void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_ntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_invntt_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_nttpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r,
const int16_t *a,
const int16_t *b,
const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r,
const int16_t *a,
const int16_t *b,
const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_basemul_avx(__m256i *r,
const __m256i *a,
const __m256i *b,
const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);

void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const __m256i *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(__m256i *r, const uint8_t *a, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);

#endif

+ 3
- 2
crypto_kem/kyber1024/avx2/params.h 查看文件

@@ -7,8 +7,6 @@
#define KYBER_N 256
#define KYBER_Q 3329

#define KYBER_ETA 2

#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

@@ -16,9 +14,12 @@
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_ETA1 2
#define KYBER_POLYCOMPRESSEDBYTES 160
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

#define KYBER_ETA2 2

#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)


+ 176
- 133
crypto_kem/kyber1024/avx2/poly.c 查看文件

@@ -12,76 +12,99 @@
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_compress
*
* Description: Compression and subsequent serialization of a polynomial
* Description: Compression and subsequent serialization of a polynomial.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER1024_AVX2_poly_reduce().
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) {
unsigned int i, j;
uint8_t t[8];

PQCLEAN_KYBER1024_AVX2_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[160], const poly *restrict a) {
size_t i;
uint32_t low;
__m256i f0, f1;
__m128i t0, t1;
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]);
const __m256i shift1 = _mm256_set1_epi16(1 << 10);
const __m256i mask = _mm256_set1_epi16(31);
const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
const __m256i sllvdidx = _mm256_set1_epi64x(12);
const __m256i shufbidx = _mm256_set_epi8( 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9,
-1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0);

r[0] = (t[0] >> 0) | (t[1] << 5);
r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7);
r[2] = (t[3] >> 1) | (t[4] << 4);
r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6);
r[4] = (t[6] >> 2) | (t[7] << 3);
r += 5;
for (i = 0; i < KYBER_N / 32; i++) {
f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
f1 = _mm256_load_si256(&a->vec[2 * i + 1]);
f0 = _mm256_mulhi_epi16(f0, v);
f1 = _mm256_mulhi_epi16(f1, v);
f0 = _mm256_mulhrs_epi16(f0, shift1);
f1 = _mm256_mulhrs_epi16(f1, shift1);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_and_si256(f1, mask);
f0 = _mm256_packus_epi16(f0, f1);
f0 = _mm256_maddubs_epi16(f0, shift2); // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
f0 = _mm256_madd_epi16(f0, shift3); // a0 a1 b0 b1 a2 a3 b2 b3
f0 = _mm256_sllv_epi32(f0, sllvdidx);
f0 = _mm256_srlv_epi64(f0, sllvdidx);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
t0 = _mm256_castsi256_si128(f0);
t1 = _mm256_extracti128_si256(f0, 1);
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
_mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
_mm_store_ss((float *)&low, _mm_castsi128_ps(t1));
r[20 * i + 16] = (uint8_t)low;
r[20 * i + 17] = (uint8_t)(low >> 0x08);
r[20 * i + 18] = (uint8_t)(low >> 0x10);
r[20 * i + 19] = (uint8_t)(low >> 0x18);
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress
*
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r,
const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) {
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, const uint8_t a[160]) {
unsigned int i;
int16_t h;
__m128i t;
__m256i f;
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]);
const __m256i shufbidx = _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5,
4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31,
248, 1984, 62, 496, 3968, 124, 992, 31);
const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024,
128, 16, 512, 64, 8, 256, 32, 1024);

unsigned int j;
uint8_t t[8];
for (i = 0; i < KYBER_N / 8; i++) {
t[0] = (a[0] >> 0);
t[1] = (a[0] >> 5) | (a[1] << 3);
t[2] = (a[1] >> 2);
t[3] = (a[1] >> 7) | (a[2] << 1);
t[4] = (a[2] >> 4) | (a[3] << 4);
t[5] = (a[3] >> 1);
t[6] = (a[3] >> 6) | (a[4] << 2);
t[7] = (a[4] >> 3);
a += 5;

for (j = 0; j < 8; j++) {
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5;
}
for (i = 0; i < KYBER_N / 16; i++) {
t = _mm_loadl_epi64((__m128i *)&a[10 * i + 0]);
h = (a[10 * i + 9] << 8) + a[10 * i + 8];
t = _mm_insert_epi16(t, h, 4);
f = _mm256_broadcastsi128_si256(t);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_and_si256(f, mask);
f = _mm256_mullo_epi16(f, shift);
f = _mm256_mulhrs_epi16(f, q);
_mm256_store_si256(&r->vec[i], f);
}
}


/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes
*
* Description: Serialization of a polynomial
* Description: Serialization of a polynomial in NTT representation.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER1024_AVX2_poly_reduce(). The coefficients are ordered as output by
* PQCLEAN_KYBER1024_AVX2_poly_ntt(); the serialized output coefficients are in bitreversed
* order.
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
@@ -90,12 +113,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
* Description: De-serialization of a polynomial;
* inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) {
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->vec, a, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
@@ -103,11 +126,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r,
const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
__m256i f, g0, g1, g2, g3, h0, h1, h2, h3;
const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3));
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0));
@@ -136,12 +158,12 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r,
g2 = _mm256_permute2x128_si256(h0,h1,0x31); \
g1 = _mm256_permute2x128_si256(h2,h3,0x20); \
g3 = _mm256_permute2x128_si256(h2,h3,0x31); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \
_mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \
_mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3)
_mm256_store_si256(&r->vec[0+2*(i)+0],g0); \
_mm256_store_si256(&r->vec[0+2*(i)+1],g1); \
_mm256_store_si256(&r->vec[8+2*(i)+0],g2); \
_mm256_store_si256(&r->vec[8+2*(i)+1],g3)

f = _mm256_load_si256((__m256i *)msg);
f = _mm256_loadu_si256((__m256i *)msg);
FROMMSG64(0);
FROMMSG64(1);
FROMMSG64(2);
@@ -151,32 +173,34 @@ void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r,
/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
* Description: Convert polynomial to 32-byte message.
* The coefficients of the input polynomial are assumed to
* lie in the interval [0,q], i.e. the polynomial must be reduced
* by PQCLEAN_KYBER1024_AVX2_poly_reduce().
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
* - poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) {
unsigned int i;
uint32_t small;
__m256i f0, f1, g0, g1;
const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4);
const __m256i hq = _mm256_set1_epi16((KYBER_Q - 1) / 2);
const __m256i hhq = _mm256_set1_epi16((KYBER_Q - 1) / 4);

for (i = 0; i < KYBER_N / 32; i++) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]);
f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]);
f0 = _mm256_sub_epi16(hqs, f0);
f1 = _mm256_sub_epi16(hqs, f1);
f0 = _mm256_load_si256(&a->vec[2 * i + 0]);
f1 = _mm256_load_si256(&a->vec[2 * i + 1]);
f0 = _mm256_sub_epi16(hq, f0);
f1 = _mm256_sub_epi16(hq, f1);
g0 = _mm256_srai_epi16(f0, 15);
g1 = _mm256_srai_epi16(f1, 15);
f0 = _mm256_xor_si256(f0, g0);
f1 = _mm256_xor_si256(f1, g1);
f0 = _mm256_sub_epi16(hhqs, f0);
f1 = _mm256_sub_epi16(hhqs, f1);
f0 = _mm256_sub_epi16(f0, hhq);
f1 = _mm256_sub_epi16(f1, hhq);
f0 = _mm256_packs_epi16(f0, f1);
small = _mm256_movemask_epi8(f0);
small = ~small;
msg[4 * i + 0] = small;
msg[4 * i + 1] = small >> 16;
msg[4 * i + 2] = small >> 8;
@@ -185,24 +209,43 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA1
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGNED_UINT8(KYBER_ETA1 * KYBER_N / 4 + 32) buf; // +32 bytes as required by PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1
prf(buf.coeffs, KYBER_ETA1 * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r, buf.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
* with parameter KYBER_ETA2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf;
prf(buf.arr, sizeof(buf.arr), seed, nonce);
PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr);
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
ALIGNED_UINT8(KYBER_ETA2 * KYBER_N / 4) buf;
prf(buf.coeffs, KYBER_ETA2 * KYBER_N / 4, seed, nonce);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta2(r, buf.vec);
}

void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
#define NOISE_NBLOCKS ((KYBER_ETA1*KYBER_N/4+SHAKE256_RATE-1)/SHAKE256_RATE)
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0,
poly *r1,
poly *r2,
poly *r3,
@@ -211,41 +254,46 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
uint8_t nonce1,
uint8_t nonce2,
uint8_t nonce3) {
ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf;
ALIGNED_UINT8(NOISE_NBLOCKS * SHAKE256_RATE) buf[4];
__m256i f;
keccakx4_state state;

f = _mm256_load_si256((__m256i *)seed);
_mm256_store_si256((__m256i *)buf.arr[0], f);
_mm256_store_si256((__m256i *)buf.arr[1], f);
_mm256_store_si256((__m256i *)buf.arr[2], f);
_mm256_store_si256((__m256i *)buf.arr[3], f);
f = _mm256_loadu_si256((__m256i *)seed);
_mm256_store_si256(buf[0].vec, f);
_mm256_store_si256(buf[1].vec, f);
_mm256_store_si256(buf[2].vec, f);
_mm256_store_si256(buf[3].vec, f);

buf.arr[0][32] = nonce0;
buf.arr[1][32] = nonce1;
buf.arr[2][32] = nonce2;
buf.arr[3][32] = nonce3;
buf[0].coeffs[32] = nonce0;
buf[1].coeffs[32] = nonce1;
buf[2].coeffs[32] = nonce2;
buf[3].coeffs[32] = nonce3;

PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33);
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state);
PQCLEAN_KYBER1024_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 33);
PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, NOISE_NBLOCKS, &state);

PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]);
PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]);
PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]);
PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r0, buf[0].vec);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r1, buf[1].vec);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r2, buf[2].vec);
PQCLEAN_KYBER1024_AVX2_poly_cbd_eta1(r3, buf[3].vec);
}


/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place;
* inputs assumed to be in normal order, output in bitreversed order
* a polynomial in place.
* Input coefficients assumed to be in normal order,
* output coefficients are in special order that is natural
* for the vectorization. Input coefficients are assumed to be
* bounded by q in absolute value, output coefficients are bounded
* by 16118 in absolute value.
*
* Arguments: - uint16_t *r: pointer to in/output polynomial
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) {
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_ntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
@@ -253,29 +301,35 @@ void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) {
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal order
* Input coefficients assumed to be in special order from vectorized
* forward ntt, output in normal order. Input coefficients can be
* arbitrary 16-bit integers, output coefficients are bounded by 14870
* in absolute value.
*
* Arguments: - uint16_t *a: pointer to in/output polynomial
* Arguments: - poly *r: pointer to in/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) {
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_invntt_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) {
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery
*
* Description: Multiplication of two polynomials in NTT domain
* Description: Multiplication of two polynomials in NTT domain.
* One of the input polynomials needs to have coefficients
* bounded by q, the other polynomial can have arbitrary
* coefficients. Output coefficients are bounded by 6656.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) {
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_basemul_avx(r->vec, a->vec, b->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
@@ -287,7 +341,7 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) {
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_tomont_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
@@ -299,28 +353,16 @@ void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) {
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) {
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of a polynomial. For details of conditional subtraction
* of q see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) {
PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata);
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->vec, PQCLEAN_KYBER1024_AVX2_qdata.vec);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_add
*
* Description: Add two polynomials
* Description: Add two polynomials. No modular reduction
* is performed.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -328,20 +370,21 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) {
unsigned int i;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
f1 = _mm256_load_si256(&b->vec[i]);
f0 = _mm256_add_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
_mm256_store_si256(&r->vec[i], f0);
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_poly_sub
*
* Description: Subtract two polynomials
* Description: Subtract two polynomials. No modular reduction
* is performed.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -349,10 +392,10 @@ void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) {
unsigned int i;
__m256i f0, f1;

for (i = 0; i < KYBER_N; i += 16) {
f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]);
f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]);
for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
f1 = _mm256_load_si256(&b->vec[i]);
f0 = _mm256_sub_epi16(f0, f1);
_mm256_store_si256((__m256i *)&r->coeffs[i], f0);
_mm256_store_si256(&r->vec[i], f0);
}
}

+ 10
- 12
crypto_kem/kyber1024/avx2/poly.h 查看文件

@@ -1,19 +1,13 @@
#ifndef PQCLEAN_KYBER1024_AVX2_POLY_H
#define PQCLEAN_KYBER1024_AVX2_POLY_H
#include "align.h"
#include "params.h"
#include <immintrin.h>
#include <stdint.h>

/*
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
*/
typedef union {
__m256i dummy;
int16_t coeffs[KYBER_N];
} poly;
typedef ALIGNED_INT16(KYBER_N) poly;

void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);
void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);

void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);
@@ -22,8 +16,11 @@ void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBY
void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);

void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER1024_AVX2_poly_getnoise_eta1_4x(poly *r0,
poly *r1,
poly *r2,
poly *r3,
@@ -33,6 +30,8 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0,
uint8_t nonce2,
uint8_t nonce3);



void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r);
void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r);
void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r);
@@ -40,7 +39,6 @@ void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, cons
void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r);
void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r);

void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b);


+ 93
- 72
crypto_kem/kyber1024/avx2/polyvec.c 查看文件

@@ -3,8 +3,79 @@
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <immintrin.h>
#include <stdint.h>

/*
 * Compress one polynomial to 11 bits per coefficient and pack the result
 * into r, processing 16 coefficients (one __m256i) per loop iteration.
 *
 * Each iteration produces 22 payload bytes (16 * 11 bits) at r[22 * i], but
 * the trailing 8-byte store covers r[22 * i + 16 .. 22 * i + 23], i.e. two
 * bytes past the payload. Inside the loop those bytes are rewritten by the
 * next iteration; after the last iteration they land past the 352-byte
 * output, which is why the parameter is declared as r[352 + 2].
 *
 * Arguments: - uint8_t *r:    output buffer, written with unaligned stores
 *            - const poly *a: input polynomial, read through a->vec[] with
 *                             aligned loads
 */
static void poly_compress11(uint8_t r[352 + 2], const poly *restrict a) {
unsigned int i;
__m256i f0, f1, f2;
__m128i t0, t1;
/* NOTE(review): presumably 16 copies of the Barrett-style constant for q
 * stored at offset _16XV of qdata -- confirm against consts.c. */
const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XV / 16]);
const __m256i v8 = _mm256_slli_epi16(v, 3);
const __m256i off = _mm256_set1_epi16(36);
const __m256i shift1 = _mm256_set1_epi16(1 << 13);
/* 2047 = 2^11 - 1: keeps the low 11 bits of every 16-bit lane. */
const __m256i mask = _mm256_set1_epi16(2047);
/* 16-bit multipliers (1, 2048, 1, 2048) per 64 bits: madd then merges two
 * adjacent 11-bit values into one 22-bit value per 32-bit lane. */
const __m256i shift2 = _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
/* As 32-bit shift counts this is (10, 0) per 64 bits: only the low dword
 * of each qword is shifted, making the two 22-bit values contiguous. */
const __m256i sllvdidx = _mm256_set1_epi64x(10);
const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
/* Low lane: keep bytes 0..10 and zero bytes 11..15. High lane: move bytes
 * 5..10 to positions 0..5 and bytes 0..4 to positions 11..15. The low half
 * doubles as the blend mask below (its -1 entries select from t1). */
const __m256i shufbidx = _mm256_set_epi8( 4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5,
-1, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

for (i = 0; i < KYBER_N / 16; i++) {
f0 = _mm256_load_si256(&a->vec[i]);
/* Scaling and rounding step. NOTE(review): this looks like the usual
 * round(2^11 * x / q) computed via a high multiply by v plus a one-bit
 * correction (f1); the exact derivation is not visible here -- confirm
 * against the reference implementation. */
f1 = _mm256_mullo_epi16(f0, v8);
f2 = _mm256_add_epi16(f0, off);
f0 = _mm256_slli_epi16(f0, 3);
f0 = _mm256_mulhi_epi16(f0, v);
f2 = _mm256_sub_epi16(f1, f2);
f1 = _mm256_andnot_si256(f1, f2);
/* Logical shift by 15 leaves 0 or 1 in each lane. */
f1 = _mm256_srli_epi16(f1, 15);
f0 = _mm256_sub_epi16(f0, f1);
f0 = _mm256_mulhrs_epi16(f0, shift1);
f0 = _mm256_and_si256(f0, mask);
/* Bit packing: 2 x 11 bits -> 22 bits per dword, then 44 bits per qword,
 * then 88 bits (11 bytes) in the low bytes of each 128-bit lane. */
f0 = _mm256_madd_epi16(f0, shift2);
f0 = _mm256_sllv_epi32(f0, sllvdidx);
/* f1 = upper qword of each 128-bit lane moved down to the lower qword. */
f1 = _mm256_bsrli_epi128(f0, 8);
f0 = _mm256_srlv_epi64(f0, srlvqidx);
f1 = _mm256_slli_epi64(f1, 34);
f0 = _mm256_add_epi64(f0, f1);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
t0 = _mm256_castsi256_si128(f0);
t1 = _mm256_extracti128_si256(f0, 1);
/* Fill bytes 11..15 of t0 with the first five bytes of the high lane's
 * packed data, giving 16 consecutive output bytes. */
t0 = _mm_blendv_epi8(t0, t1, _mm256_castsi256_si128(shufbidx));
_mm_storeu_si128((__m128i *)&r[22 * i + 0], t0);
/* Remaining six payload bytes plus two zero bytes (the overrun). */
_mm_storel_epi64((__m128i *)&r[22 * i + 16], t1);
}
}

/*
 * Unpack 11-bit compressed coefficients from a and expand them into r,
 * producing 16 coefficients (one __m256i) per loop iteration.
 *
 * Each iteration consumes 22 input bytes starting at a[22 * i] but loads a
 * full 32 bytes from that address. The last iteration therefore reads up to
 * 10 bytes past the 352-byte input, which is why the parameter is declared
 * as a[352 + 10].
 *
 * Arguments: - poly *r:          output polynomial, written through r->vec[]
 *                                with aligned stores
 *            - const uint8_t *a: input bytes, read with unaligned loads
 */
static void poly_decompress11(poly *restrict r, const uint8_t a[352 + 10]) {
unsigned int i;
__m256i f;
/* NOTE(review): presumably 16 copies of KYBER_Q stored at offset _16XQ of
 * qdata -- confirm against consts.c. */
const __m256i q = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]);
/* For every 16-bit lane, pick the two source bytes in which that lane's
 * 11-bit field starts (indices are relative to each 128-bit lane). */
const __m256i shufbidx = _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8,
8, 7, 6, 5, 5, 4, 4, 3,
10, 9, 9, 8, 7, 6, 6, 5,
5, 4, 3, 2, 2, 1, 1, 0);
/* Shifts applied at dword and then qword granularity, so that bits from the
 * neighbouring 16-bit lane can move into a lane whose field is not fully
 * covered by its own two bytes. */
const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
/* Per-lane power-of-two multipliers used as variable left shifts. */
const __m256i shift = _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32);
/* 32752 = 0x7FF0: keeps bits 4..14, i.e. an 11-bit field shifted left by 4. */
const __m256i mask = _mm256_set1_epi16(32752);

for (i = 0; i < KYBER_N / 16; i++) {
f = _mm256_loadu_si256((__m256i *)&a[22 * i]);
/* 0x94 selects qwords (0, 1, 1, 2): the low lane gets input bytes 0..15 and
 * the high lane gets input bytes 8..23, so both lanes can be shuffled with
 * lane-local indices. */
f = _mm256_permute4x64_epi64(f, 0x94);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_srlv_epi32(f, srlvdidx);
f = _mm256_srlv_epi64(f, srlvqidx);
f = _mm256_mullo_epi16(f, shift);
f = _mm256_srli_epi16(f, 1);
f = _mm256_and_si256(f, mask);
/* mulhrs is a rounded (a * b) >> 15; with the field held at bits 4..14 this
 * amounts to a rounded multiplication by q / 2^11. */
f = _mm256_mulhrs_epi16(f, q);
_mm256_store_si256(&r->vec[i], f);
}
}


/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress
*
@@ -14,33 +85,11 @@
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES],
polyvec *restrict a) {
size_t i, j, k;

PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a);
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a) {
size_t i;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
}

r[ 0] = (t[0] >> 0);
r[ 1] = (t[0] >> 8) | (t[1] << 3);
r[ 2] = (t[1] >> 5) | (t[2] << 6);
r[ 3] = (t[2] >> 2);
r[ 4] = (t[2] >> 10) | (t[3] << 1);
r[ 5] = (t[3] >> 7) | (t[4] << 4);
r[ 6] = (t[4] >> 4) | (t[5] << 7);
r[ 7] = (t[5] >> 1);
r[ 8] = (t[5] >> 9) | (t[6] << 2);
r[ 9] = (t[6] >> 6) | (t[7] << 5);
r[10] = (t[7] >> 3);
r += 11;
}
poly_compress11(&r[352 * i], &a->vec[i]);
}
}

@@ -50,31 +99,15 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBY
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
size_t i, j, k;
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]) {
size_t i;

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8);
t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5);
t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10);
t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7);
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4);
t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9);
t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6);
t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3);
a += 11;

for (k = 0; k < 8; k++) {
r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11;
}
}
poly_decompress11(&r->vec[i], &a[352 * i]);
}
}

@@ -100,7 +133,7 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyv
* Description: De-serialize vector of polynomials;
* inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes
*
* Arguments: - uint8_t *r: pointer to output byte array
* Arguments: - uint8_t *r: pointer to output byte array
* - const polyvec *a: pointer to input vector of polynomials
* (of length KYBER_POLYVECBYTES)
**************************************************/
@@ -141,29 +174,34 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) {
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery
*
* Description: Pointwise multiply elements of a and b, accumulate into r,
* Description: Multiply elements in a and b in NTT domain, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, & PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) {
size_t i;
/* Scratch polynomial holding the product of the i-th pair before it is
 * accumulated into r. */
poly tmp;

/* The first product is written straight into r, so r does not need to be
 * initialized by the caller. */
PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]);
for (i = 1; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(&tmp, &a->vec[i], &b->vec[i]);
/* poly_add is documented in poly.c as performing no modular reduction, so
 * the KYBER_K products are summed unreduced here.
 * NOTE(review): presumably callers reduce afterwards -- confirm. */
PQCLEAN_KYBER1024_AVX2_poly_add(r, r, &tmp);
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* of each element of a vector of polynomials;
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) {
size_t i;
@@ -172,23 +210,6 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) {
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq
*
* Description: Applies conditional subtraction of q to each coefficient
* of each element of a vector of polynomials
* for details of conditional subtraction of q see comments in
* reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) {
size_t i;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]);
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_AVX2_polyvec_add
*


+ 3
- 7
crypto_kem/kyber1024/avx2/polyvec.h 查看文件

@@ -8,9 +8,8 @@ typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES + 2], polyvec *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES + 12]);

void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);
void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);
@@ -18,12 +17,9 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_
void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r);

void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);
void PQCLEAN_KYBER1024_AVX2_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);

void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r);
void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r);

void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);



+ 4
- 5
crypto_kem/kyber1024/avx2/reduce.h 查看文件

@@ -1,10 +1,9 @@
#ifndef PQCLEAN_KYBER1024_AVX2_REDUCE_H
#define PQCLEAN_KYBER1024_AVX2_REDUCE_H
#include "consts.h"
#include <stdint.h>
#include "params.h"
#include <immintrin.h>

int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_reduce_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);
void PQCLEAN_KYBER1024_AVX2_tomont_avx(__m256i *r, const __m256i *PQCLEAN_KYBER1024_AVX2_qdata);

#endif

+ 72
- 309
crypto_kem/kyber1024/avx2/rejsample.c 查看文件

@@ -4,311 +4,68 @@
#include "rejsample.h"
#include <immintrin.h>
#include <stdint.h>
#include <string.h>

//#define BMI

static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = {
{-1, -1, -1, -1, -1, -1, -1, -1},
{ 0, -1, -1, -1, -1, -1, -1, -1},
{ 2, -1, -1, -1, -1, -1, -1, -1},
{ 0, 2, -1, -1, -1, -1, -1, -1},
{ 4, -1, -1, -1, -1, -1, -1, -1},
{ 0, 4, -1, -1, -1, -1, -1, -1},
{ 2, 4, -1, -1, -1, -1, -1, -1},
{ 0, 2, 4, -1, -1, -1, -1, -1},
{ 6, -1, -1, -1, -1, -1, -1, -1},
{ 0, 6, -1, -1, -1, -1, -1, -1},
{ 2, 6, -1, -1, -1, -1, -1, -1},
{ 0, 2, 6, -1, -1, -1, -1, -1},
{ 4, 6, -1, -1, -1, -1, -1, -1},
{ 0, 4, 6, -1, -1, -1, -1, -1},
{ 2, 4, 6, -1, -1, -1, -1, -1},
{ 0, 2, 4, 6, -1, -1, -1, -1},
{ 8, -1, -1, -1, -1, -1, -1, -1},
{ 0, 8, -1, -1, -1, -1, -1, -1},
{ 2, 8, -1, -1, -1, -1, -1, -1},
{ 0, 2, 8, -1, -1, -1, -1, -1},
{ 4, 8, -1, -1, -1, -1, -1, -1},
{ 0, 4, 8, -1, -1, -1, -1, -1},
{ 2, 4, 8, -1, -1, -1, -1, -1},
{ 0, 2, 4, 8, -1, -1, -1, -1},
{ 6, 8, -1, -1, -1, -1, -1, -1},
{ 0, 6, 8, -1, -1, -1, -1, -1},
{ 2, 6, 8, -1, -1, -1, -1, -1},
{ 0, 2, 6, 8, -1, -1, -1, -1},
{ 4, 6, 8, -1, -1, -1, -1, -1},
{ 0, 4, 6, 8, -1, -1, -1, -1},
{ 2, 4, 6, 8, -1, -1, -1, -1},
{ 0, 2, 4, 6, 8, -1, -1, -1},
{10, -1, -1, -1, -1, -1, -1, -1},
{ 0, 10, -1, -1, -1, -1, -1, -1},
{ 2, 10, -1, -1, -1, -1, -1, -1},
{ 0, 2, 10, -1, -1, -1, -1, -1},
{ 4, 10, -1, -1, -1, -1, -1, -1},
{ 0, 4, 10, -1, -1, -1, -1, -1},
{ 2, 4, 10, -1, -1, -1, -1, -1},
{ 0, 2, 4, 10, -1, -1, -1, -1},
{ 6, 10, -1, -1, -1, -1, -1, -1},
{ 0, 6, 10, -1, -1, -1, -1, -1},
{ 2, 6, 10, -1, -1, -1, -1, -1},
{ 0, 2, 6, 10, -1, -1, -1, -1},
{ 4, 6, 10, -1, -1, -1, -1, -1},
{ 0, 4, 6, 10, -1, -1, -1, -1},
{ 2, 4, 6, 10, -1, -1, -1, -1},
{ 0, 2, 4, 6, 10, -1, -1, -1},
{ 8, 10, -1, -1, -1, -1, -1, -1},
{ 0, 8, 10, -1, -1, -1, -1, -1},
{ 2, 8, 10, -1, -1, -1, -1, -1},
{ 0, 2, 8, 10, -1, -1, -1, -1},
{ 4, 8, 10, -1, -1, -1, -1, -1},
{ 0, 4, 8, 10, -1, -1, -1, -1},
{ 2, 4, 8, 10, -1, -1, -1, -1},
{ 0, 2, 4, 8, 10, -1, -1, -1},
{ 6, 8, 10, -1, -1, -1, -1, -1},
{ 0, 6, 8, 10, -1, -1, -1, -1},
{ 2, 6, 8, 10, -1, -1, -1, -1},
{ 0, 2, 6, 8, 10, -1, -1, -1},
{ 4, 6, 8, 10, -1, -1, -1, -1},
{ 0, 4, 6, 8, 10, -1, -1, -1},
{ 2, 4, 6, 8, 10, -1, -1, -1},
{ 0, 2, 4, 6, 8, 10, -1, -1},
{12, -1, -1, -1, -1, -1, -1, -1},
{ 0, 12, -1, -1, -1, -1, -1, -1},
{ 2, 12, -1, -1, -1, -1, -1, -1},
{ 0, 2, 12, -1, -1, -1, -1, -1},
{ 4, 12, -1, -1, -1, -1, -1, -1},
{ 0, 4, 12, -1, -1, -1, -1, -1},
{ 2, 4, 12, -1, -1, -1, -1, -1},
{ 0, 2, 4, 12, -1, -1, -1, -1},
{ 6, 12, -1, -1, -1, -1, -1, -1},
{ 0, 6, 12, -1, -1, -1, -1, -1},
{ 2, 6, 12, -1, -1, -1, -1, -1},
{ 0, 2, 6, 12, -1, -1, -1, -1},
{ 4, 6, 12, -1, -1, -1, -1, -1},
{ 0, 4, 6, 12, -1, -1, -1, -1},
{ 2, 4, 6, 12, -1, -1, -1, -1},
{ 0, 2, 4, 6, 12, -1, -1, -1},
{ 8, 12, -1, -1, -1, -1, -1, -1},
{ 0, 8, 12, -1, -1, -1, -1, -1},
{ 2, 8, 12, -1, -1, -1, -1, -1},
{ 0, 2, 8, 12, -1, -1, -1, -1},
{ 4, 8, 12, -1, -1, -1, -1, -1},
{ 0, 4, 8, 12, -1, -1, -1, -1},
{ 2, 4, 8, 12, -1, -1, -1, -1},
{ 0, 2, 4, 8, 12, -1, -1, -1},
{ 6, 8, 12, -1, -1, -1, -1, -1},
{ 0, 6, 8, 12, -1, -1, -1, -1},
{ 2, 6, 8, 12, -1, -1, -1, -1},
{ 0, 2, 6, 8, 12, -1, -1, -1},
{ 4, 6, 8, 12, -1, -1, -1, -1},
{ 0, 4, 6, 8, 12, -1, -1, -1},
{ 2, 4, 6, 8, 12, -1, -1, -1},
{ 0, 2, 4, 6, 8, 12, -1, -1},
{10, 12, -1, -1, -1, -1, -1, -1},
{ 0, 10, 12, -1, -1, -1, -1, -1},
{ 2, 10, 12, -1, -1, -1, -1, -1},
{ 0, 2, 10, 12, -1, -1, -1, -1},
{ 4, 10, 12, -1, -1, -1, -1, -1},
{ 0, 4, 10, 12, -1, -1, -1, -1},
{ 2, 4, 10, 12, -1, -1, -1, -1},
{ 0, 2, 4, 10, 12, -1, -1, -1},
{ 6, 10, 12, -1, -1, -1, -1, -1},
{ 0, 6, 10, 12, -1, -1, -1, -1},
{ 2, 6, 10, 12, -1, -1, -1, -1},
{ 0, 2, 6, 10, 12, -1, -1, -1},
{ 4, 6, 10, 12, -1, -1, -1, -1},
{ 0, 4, 6, 10, 12, -1, -1, -1},
{ 2, 4, 6, 10, 12, -1, -1, -1},
{ 0, 2, 4, 6, 10, 12, -1, -1},
{ 8, 10, 12, -1, -1, -1, -1, -1},
{ 0, 8, 10, 12, -1, -1, -1, -1},
{ 2, 8, 10, 12, -1, -1, -1, -1},
{ 0, 2, 8, 10, 12, -1, -1, -1},
{ 4, 8, 10, 12, -1, -1, -1, -1},
{ 0, 4, 8, 10, 12, -1, -1, -1},
{ 2, 4, 8, 10, 12, -1, -1, -1},
{ 0, 2, 4, 8, 10, 12, -1, -1},
{ 6, 8, 10, 12, -1, -1, -1, -1},
{ 0, 6, 8, 10, 12, -1, -1, -1},
{ 2, 6, 8, 10, 12, -1, -1, -1},
{ 0, 2, 6, 8, 10, 12, -1, -1},
{ 4, 6, 8, 10, 12, -1, -1, -1},
{ 0, 4, 6, 8, 10, 12, -1, -1},
{ 2, 4, 6, 8, 10, 12, -1, -1},
{ 0, 2, 4, 6, 8, 10, 12, -1},
{14, -1, -1, -1, -1, -1, -1, -1},
{ 0, 14, -1, -1, -1, -1, -1, -1},
{ 2, 14, -1, -1, -1, -1, -1, -1},
{ 0, 2, 14, -1, -1, -1, -1, -1},
{ 4, 14, -1, -1, -1, -1, -1, -1},
{ 0, 4, 14, -1, -1, -1, -1, -1},
{ 2, 4, 14, -1, -1, -1, -1, -1},
{ 0, 2, 4, 14, -1, -1, -1, -1},
{ 6, 14, -1, -1, -1, -1, -1, -1},
{ 0, 6, 14, -1, -1, -1, -1, -1},
{ 2, 6, 14, -1, -1, -1, -1, -1},
{ 0, 2, 6, 14, -1, -1, -1, -1},
{ 4, 6, 14, -1, -1, -1, -1, -1},
{ 0, 4, 6, 14, -1, -1, -1, -1},
{ 2, 4, 6, 14, -1, -1, -1, -1},
{ 0, 2, 4, 6, 14, -1, -1, -1},
{ 8, 14, -1, -1, -1, -1, -1, -1},
{ 0, 8, 14, -1, -1, -1, -1, -1},
{ 2, 8, 14, -1, -1, -1, -1, -1},
{ 0, 2, 8, 14, -1, -1, -1, -1},
{ 4, 8, 14, -1, -1, -1, -1, -1},
{ 0, 4, 8, 14, -1, -1, -1, -1},
{ 2, 4, 8, 14, -1, -1, -1, -1},
{ 0, 2, 4, 8, 14, -1, -1, -1},
{ 6, 8, 14, -1, -1, -1, -1, -1},
{ 0, 6, 8, 14, -1, -1, -1, -1},
{ 2, 6, 8, 14, -1, -1, -1, -1},
{ 0, 2, 6, 8, 14, -1, -1, -1},
{ 4, 6, 8, 14, -1, -1, -1, -1},
{ 0, 4, 6, 8, 14, -1, -1, -1},
{ 2, 4, 6, 8, 14, -1, -1, -1},
{ 0, 2, 4, 6, 8, 14, -1, -1},
{10, 14, -1, -1, -1, -1, -1, -1},
{ 0, 10, 14, -1, -1, -1, -1, -1},
{ 2, 10, 14, -1, -1, -1, -1, -1},
{ 0, 2, 10, 14, -1, -1, -1, -1},
{ 4, 10, 14, -1, -1, -1, -1, -1},
{ 0, 4, 10, 14, -1, -1, -1, -1},
{ 2, 4, 10, 14, -1, -1, -1, -1},
{ 0, 2, 4, 10, 14, -1, -1, -1},
{ 6, 10, 14, -1, -1, -1, -1, -1},
{ 0, 6, 10, 14, -1, -1, -1, -1},
{ 2, 6, 10, 14, -1, -1, -1, -1},
{ 0, 2, 6, 10, 14, -1, -1, -1},
{ 4, 6, 10, 14, -1, -1, -1, -1},
{ 0, 4, 6, 10, 14, -1, -1, -1},
{ 2, 4, 6, 10, 14, -1, -1, -1},
{ 0, 2, 4, 6, 10, 14, -1, -1},
{ 8, 10, 14, -1, -1, -1, -1, -1},
{ 0, 8, 10, 14, -1, -1, -1, -1},
{ 2, 8, 10, 14, -1, -1, -1, -1},
{ 0, 2, 8, 10, 14, -1, -1, -1},
{ 4, 8, 10, 14, -1, -1, -1, -1},
{ 0, 4, 8, 10, 14, -1, -1, -1},
{ 2, 4, 8, 10, 14, -1, -1, -1},
{ 0, 2, 4, 8, 10, 14, -1, -1},
{ 6, 8, 10, 14, -1, -1, -1, -1},
{ 0, 6, 8, 10, 14, -1, -1, -1},
{ 2, 6, 8, 10, 14, -1, -1, -1},
{ 0, 2, 6, 8, 10, 14, -1, -1},
{ 4, 6, 8, 10, 14, -1, -1, -1},
{ 0, 4, 6, 8, 10, 14, -1, -1},
{ 2, 4, 6, 8, 10, 14, -1, -1},
{ 0, 2, 4, 6, 8, 10, 14, -1},
{12, 14, -1, -1, -1, -1, -1, -1},
{ 0, 12, 14, -1, -1, -1, -1, -1},
{ 2, 12, 14, -1, -1, -1, -1, -1},
{ 0, 2, 12, 14, -1, -1, -1, -1},
{ 4, 12, 14, -1, -1, -1, -1, -1},
{ 0, 4, 12, 14, -1, -1, -1, -1},
{ 2, 4, 12, 14, -1, -1, -1, -1},
{ 0, 2, 4, 12, 14, -1, -1, -1},
{ 6, 12, 14, -1, -1, -1, -1, -1},
{ 0, 6, 12, 14, -1, -1, -1, -1},
{ 2, 6, 12, 14, -1, -1, -1, -1},
{ 0, 2, 6, 12, 14, -1, -1, -1},
{ 4, 6, 12, 14, -1, -1, -1, -1},
{ 0, 4, 6, 12, 14, -1, -1, -1},
{ 2, 4, 6, 12, 14, -1, -1, -1},
{ 0, 2, 4, 6, 12, 14, -1, -1},
{ 8, 12, 14, -1, -1, -1, -1, -1},
{ 0, 8, 12, 14, -1, -1, -1, -1},
{ 2, 8, 12, 14, -1, -1, -1, -1},
{ 0, 2, 8, 12, 14, -1, -1, -1},
{ 4, 8, 12, 14, -1, -1, -1, -1},
{ 0, 4, 8, 12, 14, -1, -1, -1},
{ 2, 4, 8, 12, 14, -1, -1, -1},
{ 0, 2, 4, 8, 12, 14, -1, -1},
{ 6, 8, 12, 14, -1, -1, -1, -1},
{ 0, 6, 8, 12, 14, -1, -1, -1},
{ 2, 6, 8, 12, 14, -1, -1, -1},
{ 0, 2, 6, 8, 12, 14, -1, -1},
{ 4, 6, 8, 12, 14, -1, -1, -1},
{ 0, 4, 6, 8, 12, 14, -1, -1},
{ 2, 4, 6, 8, 12, 14, -1, -1},
{ 0, 2, 4, 6, 8, 12, 14, -1},
{10, 12, 14, -1, -1, -1, -1, -1},
{ 0, 10, 12, 14, -1, -1, -1, -1},
{ 2, 10, 12, 14, -1, -1, -1, -1},
{ 0, 2, 10, 12, 14, -1, -1, -1},
{ 4, 10, 12, 14, -1, -1, -1, -1},
{ 0, 4, 10, 12, 14, -1, -1, -1},
{ 2, 4, 10, 12, 14, -1, -1, -1},
{ 0, 2, 4, 10, 12, 14, -1, -1},
{ 6, 10, 12, 14, -1, -1, -1, -1},
{ 0, 6, 10, 12, 14, -1, -1, -1},
{ 2, 6, 10, 12, 14, -1, -1, -1},
{ 0, 2, 6, 10, 12, 14, -1, -1},
{ 4, 6, 10, 12, 14, -1, -1, -1},
{ 0, 4, 6, 10, 12, 14, -1, -1},
{ 2, 4, 6, 10, 12, 14, -1, -1},
{ 0, 2, 4, 6, 10, 12, 14, -1},
{ 8, 10, 12, 14, -1, -1, -1, -1},
{ 0, 8, 10, 12, 14, -1, -1, -1},
{ 2, 8, 10, 12, 14, -1, -1, -1},
{ 0, 2, 8, 10, 12, 14, -1, -1},
{ 4, 8, 10, 12, 14, -1, -1, -1},
{ 0, 4, 8, 10, 12, 14, -1, -1},
{ 2, 4, 8, 10, 12, 14, -1, -1},
{ 0, 2, 4, 8, 10, 12, 14, -1},
{ 6, 8, 10, 12, 14, -1, -1, -1},
{ 0, 6, 8, 10, 12, 14, -1, -1},
{ 2, 6, 8, 10, 12, 14, -1, -1},
{ 0, 2, 6, 8, 10, 12, 14, -1},
{ 4, 6, 8, 10, 12, 14, -1, -1},
{ 0, 4, 6, 8, 10, 12, 14, -1},
{ 2, 4, 6, 8, 10, 12, 14, -1},
{ 0, 2, 4, 6, 8, 10, 12, 14}
}
};

#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a)
#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a)

#define REJ_UNIFORM_BUFLEN 672
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r,
const uint8_t *restrict buf) {
unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, const uint8_t *buf) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;
uint32_t good;
const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1));
uint64_t idx0, idx1, idx2, idx3;
const __m256i bound = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_qdata.vec[_16XQ / 16]);
const __m256i ones = _mm256_set1_epi8(1);
const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]);
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]);
const __m256i mask = _mm256_set1_epi16(0xFFF);
const __m256i idx8 = _mm256_set_epi8(15, 14, 14, 13, 12, 11, 11, 10,
9, 8, 8, 7, 6, 5, 5, 4,
11, 10, 10, 9, 8, 7, 7, 6,
5, 4, 4, 3, 2, 1, 1, 0);
__m256i f0, f1, g0, g1, g2, g3;
__m128i f, t, pilo, pihi;

ctr = 0;
for (pos = 0; pos < 2 * KYBER_N; pos += 64) {
f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]);
f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]);

g0 = _mm256_cmpge_epu16(bound, f0);
g1 = _mm256_cmpge_epu16(bound, f1);
ctr = pos = 0;
while (ctr <= KYBER_N - 32 && pos <= REJ_UNIFORM_AVX_BUFLEN - 48) {
f0 = _mm256_loadu_si256((__m256i *)&buf[pos]);
f1 = _mm256_loadu_si256((__m256i *)&buf[pos + 24]);
f0 = _mm256_permute4x64_epi64(f0, 0x94);
f1 = _mm256_permute4x64_epi64(f1, 0x94);
f0 = _mm256_shuffle_epi8(f0, idx8);
f1 = _mm256_shuffle_epi8(f1, idx8);
g0 = _mm256_srli_epi16(f0, 4);
g1 = _mm256_srli_epi16(f1, 4);
f0 = _mm256_blend_epi16(f0, g0, 0xAA);
f1 = _mm256_blend_epi16(f1, g1, 0xAA);
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_and_si256(f1, mask);
pos += 48;

g0 = _mm256_cmpgt_epi16(bound, f0);
g1 = _mm256_cmpgt_epi16(bound, f1);

g0 = _mm256_packs_epi16(g0, g1);
good = _mm256_movemask_epi8(g0);

g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF]));
g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF]));
g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1);
g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1);

//g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good));
//g1 = _mm256_i64gather_epi64((long long *)idx.arr, g0, 8);
/* Barrett reduction of (still unsigned) values */
g2 = _mm256_mulhi_epu16(f0, v);
g3 = _mm256_mulhi_epu16(f1, v);
g2 = _mm256_srli_epi16(g2, 10);
g3 = _mm256_srli_epi16(g3, 10);
g2 = _mm256_mullo_epi16(g2, kyberq);
g3 = _mm256_mullo_epi16(g3, kyberq);
f0 = _mm256_sub_epi16(f0, g2);
f1 = _mm256_sub_epi16(f1, g3);
idx0 = _pdep_u64(good >> 0, 0x0101010101010101);
idx1 = _pdep_u64(good >> 8, 0x0101010101010101);
idx2 = _pdep_u64(good >> 16, 0x0101010101010101);
idx3 = _pdep_u64(good >> 24, 0x0101010101010101);
idx0 = (idx0 << 8) - idx0;
idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
idx1 = (idx1 << 8) - idx1;
idx1 = _pext_u64(0x0E0C0A0806040200, idx1);
idx2 = (idx2 << 8) - idx2;
idx2 = _pext_u64(0x0E0C0A0806040200, idx2);
idx3 = (idx3 << 8) - idx3;
idx3 = _pext_u64(0x0E0C0A0806040200, idx3);

g0 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx0));
g1 = _mm256_castsi128_si256(_mm_cvtsi64_si128(idx1));
g0 = _mm256_inserti128_si256(g0, _mm_cvtsi64_si128(idx2), 1);
g1 = _mm256_inserti128_si256(g1, _mm_cvtsi64_si128(idx3), 1);

g2 = _mm256_add_epi8(g0, ones);
g3 = _mm256_add_epi8(g1, ones);
@@ -328,34 +85,40 @@ unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r,
ctr += _mm_popcnt_u32((good >> 24) & 0xFF);
}

while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) {
f = _mm_load_si128((__m128i *)&buf[pos]);
t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f);
while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_AVX_BUFLEN - 12) {
f = _mm_loadu_si128((__m128i *)&buf[pos]);
f = _mm_shuffle_epi8(f, _mm256_castsi256_si128(idx8));
t = _mm_srli_epi16(f, 4);
f = _mm_blend_epi16(f, t, 0xAA);
f = _mm_and_si128(f, _mm256_castsi256_si128(mask));
pos += 12;

t = _mm_cmpgt_epi16(_mm256_castsi256_si128(bound), f);
good = _mm_movemask_epi8(t);
good = _pext_u32(good, 0x5555);
pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]);
pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);

/* Barrett reduction */
t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v));
t = _mm_srli_epi16(t, 10);
t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq));
f = _mm_sub_epi16(f, t);
good &= 0x5555;
idx0 = _pdep_u64(good, 0x1111111111111111);
idx0 = (idx0 << 8) - idx0;
idx0 = _pext_u64(0x0E0C0A0806040200, idx0);
pilo = _mm_cvtsi64_si128(idx0);

pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones));
pilo = _mm_unpacklo_epi8(pilo, pihi);
f = _mm_shuffle_epi8(f, pilo);
_mm_storeu_si128((__m128i *)&r[ctr], f);
ctr += _mm_popcnt_u32(good);
pos += 16;
}

while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < KYBER_N && pos <= REJ_UNIFORM_AVX_BUFLEN - 3) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4));
pos += 3;

if (val < 19 * KYBER_Q) {
val -= ((int32_t)val * 20159 >> 26) * KYBER_Q;
r[ctr++] = val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (val1 < KYBER_Q && ctr < KYBER_N) {
r[ctr++] = val1;
}
}



+ 5
- 2
crypto_kem/kyber1024/avx2/rejsample.h 查看文件

@@ -1,9 +1,12 @@
#ifndef PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H
#define PQCLEAN_KYBER1024_AVX2_REJSAMPLE_H
#include "params.h"
#include "symmetric.h"
#include <stdint.h>

unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r,
const unsigned char *buf);
#define REJ_UNIFORM_AVX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
#define REJ_UNIFORM_AVX_BUFLEN (REJ_UNIFORM_AVX_NBLOCKS*XOF_BLOCKBYTES)

unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, const uint8_t *buf);

#endif

+ 6
- 6
crypto_kem/kyber1024/avx2/shuffle.S 查看文件

@@ -113,13 +113,13 @@ vmovdqa 224(%rsi),%ymm12

#csubq
csubq 5,13
csubq 6,14
csubq 7,15
csubq 8,1
csubq 6,13
csubq 7,13
csubq 8,13
csubq 9,13
csubq 10,14
csubq 11,15
csubq 12,1
csubq 10,13
csubq 11,13
csubq 12,13

#bitpack
vpsllw $12,%ymm6,%ymm4


+ 10
- 10
crypto_kem/kyber1024/avx2/shuffle.inc 查看文件

@@ -9,17 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle2 r0,r1,r2,r3
#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2
#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
#vpsllq $32,%ymm\r1,%ymm\r2
vmovsldup %ymm\r1,%ymm\r2
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrlq $32,%ymm\r0,%ymm\r0
#vmovshdup %ymm\r0,%ymm\r0
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm12
vpsrld $16,%ymm\r0,%ymm13
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
vpslld $16,%ymm\r1,%ymm\r2
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
vpsrld $16,%ymm\r0,%ymm\r0
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

+ 12
- 18
crypto_kem/kyber1024/avx2/symmetric-shake.c 查看文件

@@ -9,12 +9,10 @@
*
* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
*
* Arguments: - xof_state *state: pointer to (uninitialized) output
* Keccak state
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input
* to be absorbed into state
* - uint8_t i additional byte of input
* - uint8_t j additional byte of input
* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
* - uint8_t i: additional byte of input
* - uint8_t j: additional byte of input
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state,
const uint8_t seed[KYBER_SYMBYTES],
@@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state,
for (i = 0; i < KYBER_SYMBYTES; i++) {
extseed[i] = seed[i];
}
extseed[i++] = x;
extseed[i] = y;
extseed[KYBER_SYMBYTES + 0] = x;
extseed[KYBER_SYMBYTES + 1] = y;

shake128_absorb(state, extseed, sizeof(extseed));
}
@@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state,
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
* and then generates outlen bytes of SHAKE256 output
*
* Arguments: - uint8_t *out: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key
* (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
* Arguments: - uint8_t *out: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
**************************************************/
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out,
size_t outlen,
const uint8_t key[KYBER_SYMBYTES],
uint8_t nonce) {
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) {
unsigned int i;
uint8_t extkey[KYBER_SYMBYTES + 1];

for (i = 0; i < KYBER_SYMBYTES; i++) {
extkey[i] = key[i];
}
extkey[i] = nonce;
extkey[KYBER_SYMBYTES] = nonce;

shake256(out, outlen, extkey, sizeof(extkey));
}

+ 3
- 8
crypto_kem/kyber1024/avx2/symmetric.h 查看文件

@@ -15,21 +15,16 @@ void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *s,
uint8_t x,
uint8_t y);

void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out,
size_t outlen,
const uint8_t key[KYBER_SYMBYTES],
uint8_t nonce);
void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);

#define XOF_BLOCKBYTES SHAKE128_RATE

#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE) shake128_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) \
PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES)




+ 33
- 35
crypto_kem/kyber1024/avx2/verify.c 查看文件

@@ -8,31 +8,31 @@
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const unsigned char *a: pointer to first byte array
* const unsigned char *b: pointer to second byte array
* size_t len: length of the byte arrays
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, 1 otherwise
**************************************************/
int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
size_t pos;
size_t i;
uint64_t r;
__m256i avec, bvec, cvec;
__m256i f, g, h;

cvec = _mm256_setzero_si256();
for (pos = 0; pos + 32 <= len; pos += 32) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
avec = _mm256_xor_si256(avec, bvec);
cvec = _mm256_or_si256(cvec, avec);
h = _mm256_setzero_si256();
for (i = 0; i < len / 32; i++) {
f = _mm256_loadu_si256((__m256i *)&a[32 * i]);
g = _mm256_loadu_si256((__m256i *)&b[32 * i]);
f = _mm256_xor_si256(f, g);
h = _mm256_or_si256(h, f);
}
r = 1 - _mm256_testz_si256(cvec, cvec);
r = 1 - _mm256_testz_si256(h, h);

if (pos < len) {
avec = _mm256_loadu_si256((__m256i *)&a[pos]);
bvec = _mm256_loadu_si256((__m256i *)&b[pos]);
cvec = _mm256_cmpeq_epi8(avec, bvec);
r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len));
a += 32 * i;
b += 32 * i;
len -= 32 * i;
for (i = 0; i < len; i++) {
r |= a[i] ^ b[i];
}

r = (-r) >> 63;
@@ -47,29 +47,27 @@ int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len
* assumes two's complement representation of negative integers.
* Runs in constant time.
*
* Arguments: unsigned char *r: pointer to output byte array
* Arguments: unsigned char *r: pointer to output byte array
* const unsigned char *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* unsigned char b: Condition bit; has to be in {0,1}
* size_t len: Amount of bytes to be copied
* unsigned char b: Condition bit; has to be in {0,1}
**************************************************/
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) {
size_t pos;
void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *x, size_t len, uint8_t b) {
size_t i;
__m256i xvec, rvec, bvec;

b = -b;
bvec = _mm256_set1_epi8(b);

for (pos = 0; pos + 32 <= len; pos += 32) {
rvec = _mm256_loadu_si256((__m256i *)&r[pos]);
xvec = _mm256_loadu_si256((__m256i *)&x[pos]);
xvec = _mm256_xor_si256(xvec, rvec);
xvec = _mm256_and_si256(xvec, bvec);
rvec = _mm256_xor_si256(rvec, xvec);
_mm256_storeu_si256((__m256i *)&r[pos], rvec);
bvec = _mm256_set1_epi64x(-(uint64_t)b);
for (i = 0; i < len / 32; i++) {
rvec = _mm256_loadu_si256((__m256i *)&r[32 * i]);
xvec = _mm256_loadu_si256((__m256i *)&x[32 * i]);
rvec = _mm256_blendv_epi8(rvec, xvec, bvec);
_mm256_storeu_si256((__m256i *)&r[32 * i], rvec);
}

while (pos < len) {
r[pos] ^= b & (x[pos] ^ r[pos]);
pos += 1;
r += 32 * i;
x += 32 * i;
len -= 32 * i;
for (i = 0; i < len; i++) {
r[i] ^= -b & (x[i] ^ r[i]);
}
}

+ 38
- 5
crypto_kem/kyber1024/clean/cbd.c 查看文件

@@ -5,7 +5,7 @@
/*************************************************
* Name: load32_littleendian
*
* Description: load bytes into a 32-bit integer
* Description: load 4 bytes into a 32-bit integer
* in little-endian order
*
* Arguments: - const uint8_t *x: pointer to input byte array
@@ -22,16 +22,29 @@ static uint32_t load32_littleendian(const uint8_t x[4]) {
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_cbd
* Name: load24_littleendian
*
* Description: load 3 bytes into a 32-bit integer
* in little-endian order.
* This function is only needed for Kyber-512
*
* Arguments: - const uint8_t *x: pointer to input byte array
*
* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
**************************************************/


/*************************************************
* Name: cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* a centered binomial distribution with parameter eta=2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) {
static void cbd2(poly *r, const uint8_t buf[2 * KYBER_N / 4]) {
unsigned int i, j;
uint32_t t, d;
int16_t a, b;
@@ -48,3 +61,23 @@ void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N
}
}
}

/*************************************************
* Name: cbd3
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter eta=3.
* This function is only needed for Kyber-512
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
**************************************************/

void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]) {
cbd2(r, buf);
}

void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]) {
cbd2(r, buf);
}

+ 3
- 1
crypto_kem/kyber1024/clean/cbd.h 查看文件

@@ -4,6 +4,8 @@
#include "poly.h"
#include <stdint.h>

void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);
void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(poly *r, const uint8_t buf[KYBER_ETA1 * KYBER_N / 4]);

void PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(poly *r, const uint8_t buf[KYBER_ETA2 * KYBER_N / 4]);

#endif

+ 72
- 77
crypto_kem/kyber1024/clean/indcpa.c 查看文件

@@ -15,8 +15,8 @@
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* Arguments: uint8_t *r: pointer to the output serialized public key
* polyvec *pk: pointer to the input public-key polyvec
* const uint8_t *seed: pointer to the input public seed
**************************************************/
static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
@@ -35,10 +35,8 @@ static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES],
* Description: De-serialize public key from a byte array;
* approximate inverse of pack_pk
*
* Arguments: - polyvec *pk: pointer to output public-key
* polynomial vector
* - uint8_t *seed: pointer to output seed to generate
* matrix A
* Arguments: - polyvec *pk: pointer to output public-key polynomial vector
* - uint8_t *seed: pointer to output seed to generate matrix A
* - const uint8_t *packedpk: pointer to input serialized public key
**************************************************/
static void unpack_pk(polyvec *pk,
@@ -56,7 +54,7 @@ static void unpack_pk(polyvec *pk,
*
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - polyvec *sk: pointer to input vector of polynomials (secret key)
**************************************************/
static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
@@ -66,15 +64,12 @@ static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) {
/*************************************************
* Name: unpack_sk
*
* Description: De-serialize the secret key;
* inverse of pack_sk
* Description: De-serialize the secret key; inverse of pack_sk
*
* Arguments: - polyvec *sk: pointer to output vector of
* polynomials (secret key)
* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret key
**************************************************/
static void unpack_sk(polyvec *sk,
const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
static void unpack_sk(polyvec *sk, const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) {
PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk);
}

@@ -86,12 +81,10 @@ static void unpack_sk(polyvec *sk,
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
* poly *pk: pointer to the input vector of polynomials b
* poly *v: pointer to the input polynomial v
**************************************************/
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
polyvec *b,
poly *v) {
static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], polyvec *b, poly *v) {
PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b);
PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
}
@@ -102,13 +95,11 @@ static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES],
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of pack_ciphertext
*
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* Arguments: - polyvec *b: pointer to the output vector of polynomials b
* - poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
**************************************************/
static void unpack_ciphertext(polyvec *b,
poly *v,
const uint8_t c[KYBER_INDCPA_BYTES]) {
static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t c[KYBER_INDCPA_BYTES]) {
PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c);
PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
}
@@ -119,11 +110,9 @@ static void unpack_ciphertext(polyvec *b,
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers
* (uniform mod q)
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned int len: requested number of 16-bit integers (uniform mod q)
* - const uint8_t *buf: pointer to input buffer (assumed to be uniformly random bytes)
* - unsigned int buflen: length of input buffer in bytes
*
* Returns number of sampled 16-bit integers (at most len)
@@ -133,16 +122,19 @@ static unsigned int rej_uniform(int16_t *r,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint16_t val;
uint16_t val0, val1;

ctr = pos = 0;
while (ctr < len && pos + 2 <= buflen) {
val = buf[pos] | ((uint16_t)buf[pos + 1] << 8);
pos += 2;
while (ctr < len && pos + 3 <= buflen) {
val0 = ((buf[pos + 0] >> 0) | ((uint16_t)buf[pos + 1] << 8)) & 0xFFF;
val1 = ((buf[pos + 1] >> 4) | ((uint16_t)buf[pos + 2] << 4)) & 0xFFF;
pos += 3;

if (val < 19 * KYBER_Q) {
val -= (val >> 12) * KYBER_Q; // Barrett reduction
r[ctr++] = (int16_t)val;
if (val0 < KYBER_Q) {
r[ctr++] = val0;
}
if (ctr < len && val1 < KYBER_Q) {
r[ctr++] = val1;
}
}

@@ -160,17 +152,16 @@ static unsigned int rej_uniform(int16_t *r,
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - polyvec *a: pointer to output matrix A
* Arguments: - polyvec *a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T
* is generated
* - int transposed: boolean deciding whether A or A^T is generated
**************************************************/
#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \
+ XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) {
unsigned int ctr, i, j;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES];
unsigned int ctr, i, j, k;
unsigned int buflen, off;
uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
xof_state state;

for (i = 0; i < KYBER_K; i++) {
@@ -182,12 +173,17 @@ void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYM
}

xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state);
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf));
buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, buflen);

while (ctr < KYBER_N) {
xof_squeezeblocks(buf, 1, &state);
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf,
XOF_BLOCKBYTES);
off = buflen % 3;
for (k = 0; k < off; k++) {
buf[k] = buf[buflen - off + k];
}
xof_squeezeblocks(buf + off, 1, &state);
buflen = off + XOF_BLOCKBYTES;
ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, buflen);
}
xof_ctx_release(&state);
}
@@ -220,10 +216,10 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT
gen_a(a, publicseed);

for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++);
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&skpv.vec[i], noiseseed, nonce++);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++);
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(&e.vec[i], noiseseed, nonce++);
}

PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv);
@@ -231,7 +227,7 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&pkpv.vec[i], &a[i], &skpv);
PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]);
}

@@ -248,16 +244,15 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYT
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins
* used as seed (of length KYBER_SYMBYTES)
* to deterministically generate all
* randomness
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length KYBER_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length KYBER_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length KYBER_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins used as seed
* (of length KYBER_SYMBYTES) to deterministically
* generate all randomness
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t m[KYBER_INDCPA_MSGBYTES],
@@ -266,7 +261,7 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
unsigned int i;
uint8_t seed[KYBER_SYMBYTES];
uint8_t nonce = 0;
polyvec sp, pkpv, ep, at[KYBER_K], bp;
polyvec sp, pkpv, ep, at[KYBER_K], b;
poly v, k, epp;

unpack_pk(&pkpv, seed, pk);
@@ -274,32 +269,32 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
gen_at(at, seed);

for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++);
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(sp.vec + i, coins, nonce++);
}
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++);
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(ep.vec + i, coins, nonce++);
}
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++);
PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(&epp, coins, nonce++);

PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp);

// matrix-vector multiplication
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp);
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&b.vec[i], &at[i], &sp);
}

PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp);
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&v, &pkpv, &sp);

PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp);
PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&b);
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v);

PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep);
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&b, &b, &ep);
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp);
PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &k);
PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&bp);
PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(&b);
PQCLEAN_KYBER1024_CLEAN_poly_reduce(&v);

pack_ciphertext(c, &bp, &v);
pack_ciphertext(c, &b, &v);
}

/*************************************************
@@ -308,24 +303,24 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying Kyber.
*
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length KYBER_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length KYBER_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length KYBER_INDCPA_SECRETKEYBYTES)
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES],
const uint8_t c[KYBER_INDCPA_BYTES],
const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) {
polyvec bp, skpv;
polyvec b, skpv;
poly v, mp;

unpack_ciphertext(&bp, &v, c);
unpack_ciphertext(&b, &v, c);
unpack_sk(&skpv, sk);

PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp);
PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp);
PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&b);
PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(&mp, &skpv, &b);
PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp);

PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp);


+ 16
- 15
crypto_kem/kyber1024/clean/kem.c 查看文件

@@ -14,13 +14,14 @@
* for CCA-secure Kyber key encapsulation mechanism
*
* Arguments: - unsigned char *pk: pointer to output public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char pk[KYBER_PUBLICKEYBYTES],
unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(pk, sk);
for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -39,17 +40,17 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char
* secret for given public key
*
* Arguments: - unsigned char *ct: pointer to output cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *pk: pointer to input public key
* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
* (an already allocated array of KYBER_PUBLICKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct,
unsigned char *ss,
const unsigned char *pk) {
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char ct[KYBER_CIPHERTEXTBYTES],
unsigned char ss[KYBER_SSBYTES],
const unsigned char pk[KYBER_PUBLICKEYBYTES]) {
uint8_t buf[2 * KYBER_SYMBYTES];
/* Will contain key, coins */
uint8_t kr[2 * KYBER_SYMBYTES];
@@ -79,19 +80,19 @@ int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct,
* cipher text and private key
*
* Arguments: - unsigned char *ss: pointer to output shared secret
* (an already allocated array of CRYPTO_BYTES bytes)
* (an already allocated array of KYBER_SSBYTES bytes)
* - const unsigned char *ct: pointer to input cipher text
* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes)
* (an already allocated array of KYBER_CIPHERTEXTBYTES bytes)
* - const unsigned char *sk: pointer to input private key
* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes)
* (an already allocated array of KYBER_SECRETKEYBYTES bytes)
*
* Returns 0.
*
* On failure, ss will contain a pseudo-random value.
**************************************************/
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char *ss,
const unsigned char *ct,
const unsigned char *sk) {
int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char ss[KYBER_SSBYTES],
const unsigned char ct[KYBER_CIPHERTEXTBYTES],
const unsigned char sk[KYBER_SECRETKEYBYTES]) {
size_t i;
int fail;
uint8_t buf[2 * KYBER_SYMBYTES];


+ 42
- 57
crypto_kem/kyber1024/clean/ntt.c 查看文件

@@ -3,11 +3,11 @@
#include "reduce.h"
#include <stdint.h>

/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and PQCLEAN_KYBER1024_CLEAN_zetas_inv used in the number-theoretic transform:
/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas used in the number-theoretic transform:

#define KYBER_ROOT_OF_UNITY 17

static const uint16_t tree[128] = {
static const uint8_t tree[128] = {
0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120,
4, 68, 36, 100, 20, 84, 52, 116, 12, 76, 44, 108, 28, 92, 60, 124,
2, 66, 34, 98, 18, 82, 50, 114, 10, 74, 42, 106, 26, 90, 58, 122,
@@ -19,51 +19,41 @@ static const uint16_t tree[128] = {
};

void init_ntt() {
unsigned int i, j, k;
unsigned int i;
int16_t tmp[128];

tmp[0] = MONT;
for(i = 1; i < 128; ++i)
tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q);
for(i=1;i<128;i++)
tmp[i] = fqmul(tmp[i-1],MONT*KYBER_ROOT_OF_UNITY % KYBER_Q);

for(i = 0; i < 128; ++i)
for(i=0;i<128;i++) {
PQCLEAN_KYBER1024_CLEAN_zetas[i] = tmp[tree[i]];

k = 0;
for(i = 64; i >= 1; i >>= 1)
for(j = i; j < 2*i; ++j)
PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]];

PQCLEAN_KYBER1024_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q;
if(PQCLEAN_KYBER1024_CLEAN_zetas[i] > KYBER_Q/2)
PQCLEAN_KYBER1024_CLEAN_zetas[i] -= KYBER_Q;
if(PQCLEAN_KYBER1024_CLEAN_zetas[i] < -KYBER_Q/2)
PQCLEAN_KYBER1024_CLEAN_zetas[i] += KYBER_Q;
}
}

*/

const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128] = {
2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962,
2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017,
732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047,
1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830,
107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226,
430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574,
1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349,
418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193,
1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459,
478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628
};

const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128] = {
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535,
1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465,
1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685,
1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235,
3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652,
1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853,
1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552,
2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871,
829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171,
3127, 3042, 1907, 1836, 1517, 359, 758, 1441
};
-1044, -758, -359, -1517, 1493, 1422, 287, 202,
-171, 622, 1577, 182, 962, -1202, -1474, 1468,
573, -1325, 264, 383, -829, 1458, -1602, -130,
-681, 1017, 732, 608, -1542, 411, -205, -1571,
1223, 652, -552, 1015, -1293, 1491, -282, -1544,
516, -8, -320, -666, -1618, -1162, 126, 1469,
-853, -90, -271, 830, 107, -1421, -247, -951,
-398, 961, -1508, -725, 448, -1065, 677, -1275,
-1103, 430, 555, 843, -1251, 871, 1550, 105,
422, 587, 177, -235, -291, -460, 1574, 1653,
-246, 778, 1159, -147, -777, 1483, -602, 1119,
-1590, 644, -872, 349, 418, 329, -156, -75,
817, 1097, 603, 610, 1322, -1285, -1465, 384,
-1215, -136, 1218, -1335, -874, 220, -1187, -1659,
-1185, -1530, -1278, 794, -1510, -854, -870, 478,
-108, -308, 996, 991, 958, -1460, 1522, 1628
};

/*************************************************
* Name: fqmul
@@ -82,11 +72,10 @@ static int16_t fqmul(int16_t a, int16_t b) {
/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_ntt
*
* Description: Inplace number-theoretic transform (NTT) in Rq
* Description: Inplace number-theoretic transform (NTT) in Rq.
* input is in standard order, output is in bitreversed order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) {
unsigned int len, start, j, k;
@@ -96,7 +85,7 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) {
for (len = 128; len >= 2; len >>= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k++];
for (j = start; j < start + len; ++j) {
for (j = start; j < start + len; j++) {
t = fqmul(zeta, r[j + len]);
r[j + len] = r[j] - t;
r[j] = r[j] + t;
@@ -112,28 +101,28 @@ void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) {
* multiplication by Montgomery factor 2^16.
* Input is in bitreversed order, output is in standard order
*
* Arguments: - int16_t r[256]: pointer to input/output vector of elements
* of Zq
* Arguments: - int16_t r[256]: pointer to input/output vector of elements of Zq
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) {
unsigned int start, len, j, k;
int16_t t, zeta;
const int16_t f = 1441; // mont^2/128

k = 0;
k = 127;
for (len = 2; len <= 128; len <<= 1) {
for (start = 0; start < 256; start = j + len) {
zeta = PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++];
for (j = start; j < start + len; ++j) {
zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k--];
for (j = start; j < start + len; j++) {
t = r[j];
r[j] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(t + r[j + len]);
r[j + len] = t - r[j + len];
r[j + len] = r[j + len] - t;
r[j + len] = fqmul(zeta, r[j + len]);
}
}
}

for (j = 0; j < 256; ++j) {
r[j] = fqmul(r[j], PQCLEAN_KYBER1024_CLEAN_zetas_inv[127]);
for (j = 0; j < 256; j++) {
r[j] = fqmul(r[j], f);
}
}

@@ -143,19 +132,15 @@ void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) {
* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta)
* used for multiplication of elements in Rq in NTT domain
*
* Arguments: - int16_t r[2]: pointer to the output polynomial
* Arguments: - int16_t r[2]: pointer to the output polynomial
* - const int16_t a[2]: pointer to the first factor
* - const int16_t b[2]: pointer to the second factor
* - int16_t zeta: integer defining the reduction polynomial
* - int16_t zeta: integer defining the reduction polynomial
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2],
const int16_t a[2],
const int16_t b[2],
int16_t zeta) {
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) {
r[0] = fqmul(a[1], b[1]);
r[0] = fqmul(r[0], zeta);
r[0] += fqmul(a[0], b[0]);

r[1] = fqmul(a[0], b[1]);
r[1] += fqmul(a[1], b[0]);
}

+ 1
- 6
crypto_kem/kyber1024/clean/ntt.h 查看文件

@@ -5,15 +5,10 @@

extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128];

extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128];

void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]);

void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]);

void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2],
const int16_t a[2],
const int16_t b[2],
int16_t zeta);
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta);

#endif

+ 7
- 9
crypto_kem/kyber1024/clean/params.h 查看文件

@@ -7,8 +7,6 @@
#define KYBER_N 256
#define KYBER_Q 3329

#define KYBER_ETA 2

#define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define KYBER_SSBYTES 32 /* size in bytes of shared key */

@@ -16,20 +14,20 @@
#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)

#define KYBER_K 4
#define KYBER_ETA1 2
#define KYBER_POLYCOMPRESSEDBYTES 160
#define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

#define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
#define KYBER_ETA2 2

#define KYBER_INDCPA_MSGBYTES (KYBER_SYMBYTES)
#define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES)
#define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \
+ KYBER_POLYCOMPRESSEDBYTES)
#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)

#define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
/* 32 bytes of additional space to save H(pk) */
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \
+ KYBER_INDCPA_PUBLICKEYBYTES \
+ 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES
#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES)
#define KYBER_CIPHERTEXTBYTES (KYBER_INDCPA_BYTES)

#endif

+ 55
- 50
crypto_kem/kyber1024/clean/poly.c 查看文件

@@ -13,17 +13,19 @@
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length KYBER_POLYCOMPRESSEDBYTES)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) {
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a) {
size_t i, j;
int16_t u;
uint8_t t[8];

PQCLEAN_KYBER1024_CLEAN_poly_csubq(a);

for (i = 0; i < KYBER_N / 8; i++) {
for (j = 0; j < 8; j++) {
t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
// map to positive standard representatives
u = a->coeffs[8 * i + j];
u += (u >> 15) & KYBER_Q;
t[j] = ((((uint32_t)u << 5) + KYBER_Q / 2) / KYBER_Q) & 31;
}

r[0] = (t[0] >> 0) | (t[1] << 5);
@@ -41,7 +43,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES],
* Description: De-serialization and subsequent decompression of a polynomial;
* approximate inverse of PQCLEAN_KYBER1024_CLEAN_poly_compress
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYCOMPRESSEDBYTES bytes)
**************************************************/
@@ -74,20 +76,21 @@ void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLY
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYBYTES bytes)
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
// NOTE(review): two signature lines follow (with and without `const` on `a`) and several
// statements below appear in two forms. This span looks like a rendered diff showing
// pre- and post-change lines together -- confirm which lines belong to the current file.
// Purpose: serialize the polynomial `a` into the byte array `r`, three output bytes per
// pair of coefficients.
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a) {
size_t i;
uint16_t t0, t1;

// conditional subtraction of q on the coefficients of `a`
// NOTE(review): a helper of this name elsewhere in this listing takes a non-const poly *
// and updates the coefficients in place -- confirm against the `const poly *a` signature
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a);

// each iteration consumes coefficients 2*i and 2*i+1 and writes bytes 3*i .. 3*i+2
for (i = 0; i < KYBER_N / 2; i++) {
t0 = a->coeffs[2 * i];
// map to positive standard representatives
t0 = a->coeffs[2 * i];
// adds KYBER_Q when the value is negative as int16_t; relies on >> of a negative
// value being an arithmetic (sign-extending) shift
t0 += ((int16_t)t0 >> 15) & KYBER_Q;
t1 = a->coeffs[2 * i + 1];
// byte 0: low 8 bits of t0; byte 1: remaining high bits of t0 combined with the low
// bits of t1 moved up by 4; byte 2: t1 >> 4 (presumably both values are below 2^12)
r[3 * i + 0] = (uint8_t)(t0 >> 0);
r[3 * i + 1] = (uint8_t)((t0 >> 8) | (t1 << 4));
r[3 * i + 2] = (uint8_t)(t1 >> 4);
// same sign correction as for t0
t1 += ((int16_t)t1 >> 15) & KYBER_Q;
// same three-byte packing, written without the explicit uint8_t casts
r[3 * i + 0] = (t0 >> 0);
r[3 * i + 1] = (t0 >> 8) | (t1 << 4);
r[3 * i + 2] = (t1 >> 4);
}
}

@@ -97,7 +100,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) {
* Description: De-serialization of a polynomial;
* inverse of PQCLEAN_KYBER1024_CLEAN_poly_tobytes
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of KYBER_POLYBYTES bytes)
**************************************************/
@@ -114,7 +117,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYB
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) {
@@ -135,41 +138,60 @@ void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCP
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - poly *a: pointer to input polynomial
* - const poly *a: pointer to input polynomial
**************************************************/
// NOTE(review): two signature lines follow (with and without `const` on `a`) and the
// inner loop computes `t` in two different ways. This span looks like a rendered diff
// showing pre- and post-change lines together -- confirm which lines are current.
// Purpose: derive one message bit from each coefficient of `a` and pack eight bits
// into each byte of `msg`.
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) {
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a) {
size_t i, j;
uint16_t t;

// conditional subtraction of q on the coefficients of `a`
// NOTE(review): a helper of this name elsewhere in this listing takes a non-const poly *
// and updates the coefficients in place -- confirm against the `const poly *a` signature
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a);

// one output byte per group of eight coefficients
for (i = 0; i < KYBER_N / 8; i++) {
msg[i] = 0;
for (j = 0; j < 8; j++) {
// single-expression form: (2*coeff + q/2) / q, reduced to its lowest bit
t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
// three-step form: load the coefficient, add KYBER_Q if it is negative as int16_t
// (relies on an arithmetic right shift), then apply the same rounding division
t = a->coeffs[8 * i + j];
t += ((int16_t)t >> 15) & KYBER_Q;
t = (((t << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
// bit j of byte i carries the bit derived from coefficient 8*i + j
msg[i] |= t << j;
}
}
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA1
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
    /* PRF output, KYBER_ETA1 * KYBER_N / 4 bytes, handed on to the eta1 CBD routine */
    uint8_t prf_out[KYBER_ETA1 * KYBER_N / 4];

    /* expand (seed, nonce) into the whole buffer */
    prf(prf_out, sizeof prf_out, seed, nonce);

    /* turn the PRF bytes into the coefficients of r */
    PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta1(r, prf_out);
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter KYBER_ETA
* with parameter KYBER_ETA2
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length KYBER_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
* - uint8_t nonce: one-byte input nonce
**************************************************/
// NOTE(review): this span holds two signature lines (poly_getnoise and poly_getnoise_eta2),
// two buffer declarations (KYBER_ETA and KYBER_ETA2) and two sampler calls. It looks like a
// rendered diff showing pre- and post-change lines together -- confirm which are current.
// Purpose: fill a buffer with PRF output derived from (seed, nonce) and pass it to a
// centered-binomial sampling routine that writes the polynomial r.
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA * KYBER_N / 4];
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) {
uint8_t buf[KYBER_ETA2 * KYBER_N / 4];
// the whole buffer is requested from the PRF in one call
prf(buf, sizeof(buf), seed, nonce);
// sampler call under the un-suffixed name
PQCLEAN_KYBER1024_CLEAN_cbd(r, buf);
// sampler call under the eta2-specific name
PQCLEAN_KYBER1024_CLEAN_poly_cbd_eta2(r, buf);
}


/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_poly_ntt
*
@@ -202,7 +224,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r) {
*
* Description: Multiplication of two polynomials in NTT domain
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -210,8 +232,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con
size_t i;
for (i = 0; i < KYBER_N / 4; i++) {
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2],
-PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]);
PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]);
}
}

@@ -246,28 +267,12 @@ void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r) {
}
}

/*************************************************
* Name:        PQCLEAN_KYBER1024_CLEAN_poly_csubq
*
* Description: Replaces every coefficient of a polynomial, in place, by the
*              result of PQCLEAN_KYBER1024_CLEAN_csubq applied to it, i.e.
*              performs the conditional subtraction of q coefficient by
*              coefficient.
*
* Arguments:   - poly *r: pointer to input/output polynomial
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r) {
    int16_t *coeff = r->coeffs;
    int16_t *const stop = coeff + KYBER_N;

    /* walk the KYBER_N coefficients front to back */
    for (; coeff != stop; coeff++) {
        *coeff = PQCLEAN_KYBER1024_CLEAN_csubq(*coeff);
    }
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_poly_add
*
* Description: Add two polynomials
* Description: Add two polynomials; no modular reduction is performed
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
@@ -281,7 +286,7 @@ void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b) {
/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_poly_sub
*
* Description: Subtract two polynomials
* Description: Subtract two polynomials; no modular reduction is performed
*
* Arguments: - poly *r: pointer to output polynomial
* - const poly *a: pointer to first input polynomial


+ 6
- 5
crypto_kem/kyber1024/clean/poly.h 查看文件

@@ -11,16 +11,18 @@ typedef struct {
int16_t coeffs[KYBER_N];
} poly;

void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a);
void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], const poly *a);
void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]);

void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a);
void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], const poly *a);
void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]);

void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]);
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a);
void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], const poly *a);

void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);
void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta1(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER1024_CLEAN_poly_getnoise_eta2(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce);

void PQCLEAN_KYBER1024_CLEAN_poly_ntt(poly *r);
void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r);
@@ -28,7 +30,6 @@ void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, con
void PQCLEAN_KYBER1024_CLEAN_poly_tomont(poly *r);

void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r);
void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r);

void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b);
void PQCLEAN_KYBER1024_CLEAN_poly_sub(poly *r, const poly *a, const poly *b);


+ 15
- 36
crypto_kem/kyber1024/clean/polyvec.c 查看文件

@@ -10,19 +10,18 @@
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECCOMPRESSEDBYTES)
* - polyvec *a: pointer to input vector of polynomials
* - const polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) {
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a) {
unsigned int i, j, k;

PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(a);

uint16_t t[8];
for (i = 0; i < KYBER_K; i++) {
for (j = 0; j < KYBER_N / 8; j++) {
for (k = 0; k < 8; k++) {
t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2)
/ KYBER_Q) & 0x7ff;
t[k] = a->vec[i].coeffs[8 * j + k];
t[k] += ((int16_t)t[k] >> 15) & KYBER_Q;
t[k] = ((((uint32_t)t[k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff;
}

r[ 0] = (uint8_t)(t[0] >> 0);
@@ -51,8 +50,7 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDB
* - const uint8_t *a: pointer to input byte array
* (of length KYBER_POLYVECCOMPRESSEDBYTES)
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) {
unsigned int i, j, k;

uint16_t t[8];
@@ -82,9 +80,9 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r,
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for KYBER_POLYVECBYTES)
* - polyvec *a: pointer to input vector of polynomials
* - const polyvec *a: pointer to input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) {
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a) {
unsigned int i;
for (i = 0; i < KYBER_K; i++) {
PQCLEAN_KYBER1024_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
@@ -138,18 +136,16 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r) {
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery
*
* Description: Pointwise multiply elements of a and b, accumulate into r,
* Description: Multiply elements of a and b in NTT domain, accumulate into r,
* and multiply by 2^-16.
*
* Arguments: - poly *r: pointer to output polynomial
* Arguments: - poly *r: pointer to output polynomial
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b) {
void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b) {
unsigned int i;
poly t;

@@ -166,10 +162,10 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials
* of each element of a vector of polynomials;
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - poly *r: pointer to input/output polynomial
* Arguments: - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) {
unsigned int i;
@@ -178,29 +174,12 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) {
}
}

/*************************************************
* Name:        PQCLEAN_KYBER1024_CLEAN_polyvec_csubq
*
* Description: Runs PQCLEAN_KYBER1024_CLEAN_poly_csubq on each of the
*              KYBER_K polynomials of the vector, in index order, so that
*              the conditional subtraction of q is applied to every
*              coefficient of every element.
*
* Arguments:   - polyvec *r: pointer to input/output vector of polynomials
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r) {
    poly *elem = r->vec;
    poly *const stop = elem + KYBER_K;

    while (elem != stop) {
        PQCLEAN_KYBER1024_CLEAN_poly_csubq(elem);
        elem++;
    }
}

/*************************************************
* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_add
*
* Description: Add vectors of polynomials
*
* Arguments: - polyvec *r: pointer to output vector of polynomials
* Arguments: - polyvec *r: pointer to output vector of polynomials
* - const polyvec *a: pointer to first input vector of polynomials
* - const polyvec *b: pointer to second input vector of polynomials
**************************************************/


+ 4
- 8
crypto_kem/kyber1024/clean/polyvec.h 查看文件

@@ -8,22 +8,18 @@ typedef struct {
poly vec[KYBER_K];
} polyvec;

void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a);
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r,
const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);
void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], const polyvec *a);
void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]);

void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a);
void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], const polyvec *a);
void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]);

void PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(polyvec *r);
void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r);

void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r,
const polyvec *a,
const polyvec *b);
void PQCLEAN_KYBER1024_CLEAN_polyvec_basemul_acc_montgomery(poly *r, const polyvec *a, const polyvec *b);

void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r);
void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r);

void PQCLEAN_KYBER1024_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b);



+ 4
- 20
crypto_kem/kyber1024/clean/reduce.c 查看文件

@@ -6,8 +6,7 @@
* Name: PQCLEAN_KYBER1024_CLEAN_montgomery_reduce
*
* Description: Montgomery reduction; given a 32-bit integer a, computes
* 16-bit integer congruent to a * R^-1 mod q,
* where R=2^16
* 16-bit integer congruent to a * R^-1 mod q, where R=2^16
*
* Arguments: - int32_t a: input integer to be reduced;
* has to be in {-q2^15,...,q2^15-1}
@@ -29,32 +28,17 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a) {
* Name: PQCLEAN_KYBER1024_CLEAN_barrett_reduce
*
* Description: Barrett reduction; given a 16-bit integer a, computes
* 16-bit integer congruent to a mod q in {0,...,q}
* centered representative congruent to a mod q in {-(q-1)/2,...,(q-1)/2}
*
* Arguments: - int16_t a: input integer to be reduced
*
* Returns: integer in {0,...,q} congruent to a modulo q.
* Returns: integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
**************************************************/
int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a) {
    int16_t t;
    /* Barrett multiplier: 2^26 / q, rounded to nearest by adding q/2 before dividing */
    const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q;

    /* t approximates a / q rounded to nearest: the 2^25 term is half of the 2^26
     * divisor, so the shift rounds instead of truncating. Relies on >> of a negative
     * value being an arithmetic shift. (An earlier truncating assignment to t was
     * overwritten by this one before being read and has been dropped.) */
    t = ((int32_t)v * a + (1 << 25)) >> 26;

    /* remove t multiples of q; the result is congruent to a modulo q */
    t *= KYBER_Q;
    return a - t;
}

/*************************************************
* Name:        PQCLEAN_KYBER1024_CLEAN_csubq
*
* Description: Conditionally subtract q without branching. q is subtracted
*              first; the sign of the difference is then spread into a mask
*              by the >> 15 shift (this relies on right shifts of negative
*              values being arithmetic) and the mask decides whether q is
*              added back.
*
* Arguments:   - int16_t a: input integer
*
* Returns:     a - q if a >= q, else a
**************************************************/
int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a) {
    const int16_t lowered = (int16_t)(a - KYBER_Q);
    /* (lowered >> 15) is all ones when lowered is negative, zero otherwise */
    const int16_t restore = (int16_t)((lowered >> 15) & KYBER_Q);

    return (int16_t)(lowered + restore);
}

+ 0
- 2
crypto_kem/kyber1024/clean/reduce.h 查看文件

@@ -10,6 +10,4 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a);

int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a);

int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a);

#endif

+ 12
- 18
crypto_kem/kyber1024/clean/symmetric-shake.c 查看文件

@@ -9,12 +9,10 @@
*
* Description: Absorb step of the SHAKE128 specialized for the Kyber context.
*
* Arguments: - xof_state *state: pointer to (uninitialized) output
* Keccak state
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input
* to be absorbed into state
* - uint8_t i additional byte of input
* - uint8_t j additional byte of input
* Arguments: - xof_state *state: pointer to (uninitialized) output Keccak state
* - const uint8_t *seed: pointer to KYBER_SYMBYTES input to be absorbed into state
* - uint8_t i: additional byte of input
* - uint8_t j: additional byte of input
**************************************************/
void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state,
const uint8_t seed[KYBER_SYMBYTES],
@@ -26,8 +24,8 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state,
for (i = 0; i < KYBER_SYMBYTES; i++) {
extseed[i] = seed[i];
}
extseed[i++] = x;
extseed[i] = y;
extseed[KYBER_SYMBYTES + 0] = x;
extseed[KYBER_SYMBYTES + 1] = y;

shake128_absorb(state, extseed, sizeof(extseed));
}
@@ -38,23 +36,19 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state,
* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input
* and then generates outlen bytes of SHAKE256 output
*
* Arguments: - uint8_t *out: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key
* (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
* Arguments: - uint8_t *out: pointer to output
* - size_t outlen: number of requested output bytes
* - const uint8_t *key: pointer to the key (of length KYBER_SYMBYTES)
* - uint8_t nonce: single-byte nonce (public PRF input)
**************************************************/
// NOTE(review): the signature appears twice (split over four lines, then on one line) and
// the nonce byte is stored by two consecutive statements. This span looks like a rendered
// diff showing pre- and post-change lines together -- confirm which lines are current.
// Purpose: build key || nonce in a local buffer and produce outlen bytes of SHAKE256
// output from it.
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out,
size_t outlen,
const uint8_t key[KYBER_SYMBYTES],
uint8_t nonce) {
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce) {
unsigned int i;
// KYBER_SYMBYTES key bytes followed by one nonce byte
uint8_t extkey[KYBER_SYMBYTES + 1];

for (i = 0; i < KYBER_SYMBYTES; i++) {
extkey[i] = key[i];
}
// after the loop i == KYBER_SYMBYTES, so both stores below write the same last byte
extkey[i] = nonce;
extkey[KYBER_SYMBYTES] = nonce;

shake256(out, outlen, extkey, sizeof(extkey));
}

+ 3
- 8
crypto_kem/kyber1024/clean/symmetric.h 查看文件

@@ -14,21 +14,16 @@ void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *s,
uint8_t x,
uint8_t y);

void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out,
size_t outlen,
const uint8_t key[KYBER_SYMBYTES],
uint8_t nonce);
void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, size_t outlen, const uint8_t key[KYBER_SYMBYTES], uint8_t nonce);

#define XOF_BLOCKBYTES SHAKE128_RATE

#define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES)
#define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES)
#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \
shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define xof_ctx_release(STATE) shake128_ctx_release(STATE)
#define prf(OUT, OUTBYTES, KEY, NONCE) \
PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE)
#define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES)




+ 4
- 4
crypto_kem/kyber512-90s/META.yml 查看文件

@@ -3,10 +3,10 @@ type: kem
claimed-nist-level: 1
claimed-security: IND-CCA2
length-public-key: 800
length-ciphertext: 736
length-ciphertext: 768
length-secret-key: 1632
length-shared-secret: 32
nistkat-sha256: d081dafce242de5d2a9b1cfe2b304cf5ebaed71b7a91f028fefd569693307d45
nistkat-sha256: 7bfe0653b63b3fac7ee300a6e4801046c1a3d8d445b271633b6c9d81ed125e5b
principal-submitters:
- Peter Schwabe
auxiliary-submitters:
@@ -21,9 +21,9 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
- name: avx2
version: https://github.com/pq-crystals/kyber/commit/844057468e69527bd15b17fbe03f4b61f9a22065 via https://github.com/jschanck/package-pqclean/tree/b45068b8/kyber
version: https://github.com/pq-crystals/kyber/commit/e7faae9f662f5b92fee4e966f09b2f23e1e91c65 via https://github.com/jschanck/package-pqclean/tree/231c9bec/kyber
supported_platforms:
- architecture: x86_64
operating_systems:


+ 27
- 30
crypto_kem/kyber512-90s/avx2/aes256ctr.c 查看文件

@@ -2,52 +2,48 @@
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
/*
Based heavily on public-domain code by Romain Dolbeau
Different handling of nonce+counter than original version
using separated 64-bit nonce and internal 64-bit counter, starting from zero
Public Domain
*/
/* Based heavily on public-domain code by Romain Dolbeau
* Different handling of nonce+counter than original version using
* separated 64-bit nonce and internal 64-bit counter, starting from zero
* Public Domain */


static inline void aesni_encrypt4(uint8_t out[64],
__m128i *n,
const __m128i rkeys[16]) {
__m128i f, f0, f1, f2, f3, t;
static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) {
__m128i f, f0, f1, f2, f3;
const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);

/* Load current counter value */
f = _mm_load_si128(n);

/* Increase counter in 4 consecutive blocks */
t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t);
f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx);
f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx);
f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx);
f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx);

/* Write counter for next iteration, increased by 4 */
_mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0)));

/* Actual AES encryption, 4x interleaved */
t = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, t);
f1 = _mm_xor_si128(f1, t);
f2 = _mm_xor_si128(f2, t);
f3 = _mm_xor_si128(f3, t);
f = _mm_load_si128(&rkeys[0]);
f0 = _mm_xor_si128(f0, f);
f1 = _mm_xor_si128(f1, f);
f2 = _mm_xor_si128(f2, f);
f3 = _mm_xor_si128(f3, f);

for (int i = 1; i < 14; i++) {
t = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, t);
f1 = _mm_aesenc_si128(f1, t);
f2 = _mm_aesenc_si128(f2, t);
f3 = _mm_aesenc_si128(f3, t);
f = _mm_load_si128(&rkeys[i]);
f0 = _mm_aesenc_si128(f0, f);
f1 = _mm_aesenc_si128(f1, f);
f2 = _mm_aesenc_si128(f2, f);
f3 = _mm_aesenc_si128(f3, f);
}

t = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, t);
f1 = _mm_aesenclast_si128(f1, t);
f2 = _mm_aesenclast_si128(f2, t);
f3 = _mm_aesenclast_si128(f3, t);
f = _mm_load_si128(&rkeys[14]);
f0 = _mm_aesenclast_si128(f0, f);
f1 = _mm_aesenclast_si128(f1, f);
f2 = _mm_aesenclast_si128(f2, f);
f3 = _mm_aesenclast_si128(f3, f);

/* Write results */
_mm_storeu_si128((__m128i *)(out + 0), f0);
@@ -134,6 +130,7 @@ void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out,
while (outlen >= 64) {
aesni_encrypt4(out, &state.n, state.rkeys);
outlen -= 64;
out += 64;
}

if (outlen) {


+ 9
- 13
crypto_kem/kyber512-90s/avx2/align.h 查看文件

@@ -2,22 +2,18 @@
#define PQCLEAN_KYBER51290S_AVX2_ALIGN_H

#include <immintrin.h>
#include <stdint.h>

#define ALIGN16_TYPE(t) \
union { \
__m128i vec; \
t orig; \
#define ALIGNED_UINT8(N) \
union { \
uint8_t coeffs[(N)]; \
__m256i vec[((N)+31)/32]; \
}

#define ALIGN32_ARRAY(t, s) \
union { \
__m256i vec; \
t arr[(s)]; \
#define ALIGNED_INT16(N) \
union { \
int16_t coeffs[(N)]; \
__m256i vec[((N)+15)/16]; \
}

#define ALIGN32_ARRAY_2D(t, n, m) \
union { \
__m256i vec; \
t arr[(n)][(m)]; \
}
#endif

+ 1
- 1
crypto_kem/kyber512-90s/avx2/api.h 查看文件

@@ -5,7 +5,7 @@

#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_SECRETKEYBYTES 1632
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_PUBLICKEYBYTES 800
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 736
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_CIPHERTEXTBYTES 768
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_BYTES 32
#define PQCLEAN_KYBER51290S_AVX2_CRYPTO_ALGNAME "Kyber512-90s"



+ 96
- 205
crypto_kem/kyber512-90s/avx2/basemul.S 查看文件

@@ -1,216 +1,107 @@
#include "cdecl.h"
#include "params.h"

.macro schoolbook off,sign
#load
vmovdqa \off+32(%rsi),%ymm7 # b
vmovdqa \off+32(%rdx),%ymm8 # d
vmovdqa \off(%rsi),%ymm9 # a
vmovdqa \off(%rdx),%ymm10 # c

#mul
vpmullw %ymm7,%ymm8,%ymm11 # bd.lo
vpmulhw %ymm7,%ymm8,%ymm12 # bd.hi
vpmullw %ymm7,%ymm10,%ymm13 # bc.lo
vpmulhw %ymm7,%ymm10,%ymm7 # bc.hi
vpmullw %ymm9,%ymm8,%ymm14 # ad.lo
vpmulhw %ymm9,%ymm8,%ymm8 # ad.hi
vpmullw %ymm9,%ymm10,%ymm15 # ac.lo
vpmulhw %ymm9,%ymm10,%ymm9 # ac.hi

#reduce
vpmullw %ymm1,%ymm11,%ymm11
vpmulhw %ymm0,%ymm11,%ymm11
vpsubw %ymm11,%ymm12,%ymm11 # bd

#mul
vpmullw %ymm2,%ymm11,%ymm10 # rbd.lo
vpmulhw %ymm2,%ymm11,%ymm11 # rbd.hi

#unpack
vpunpcklwd %ymm7,%ymm13,%ymm12 # bc0
vpunpckhwd %ymm7,%ymm13,%ymm7 # bc1
vpunpcklwd %ymm8,%ymm14,%ymm13 # ad0
vpunpckhwd %ymm8,%ymm14,%ymm8 # ad1
vpunpcklwd %ymm9,%ymm15,%ymm14 # ac0
vpunpckhwd %ymm9,%ymm15,%ymm9 # ac1
vpunpcklwd %ymm11,%ymm10,%ymm15 # rbd0
vpunpckhwd %ymm11,%ymm10,%ymm10 # rbd1

#add
.ifeq \sign
vpaddd %ymm14,%ymm15,%ymm14 # x0
vpaddd %ymm9,%ymm10,%ymm9 # x1
.else
vpsubd %ymm15,%ymm14,%ymm14 # x0
vpsubd %ymm10,%ymm9,%ymm9 # x1
.endif
vpaddd %ymm12,%ymm13,%ymm12 # y0
vpaddd %ymm7,%ymm8,%ymm7 # y1
.endm

.macro red a0,a1,b0,b1,x,y,z
#pack
vpxor %ymm\x,%ymm\x,%ymm\x
vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\a1,%ymm\z
vpsrld $16,%ymm\a0,%ymm\a0
vpsrld $16,%ymm\a1,%ymm\a1
vpackusdw %ymm\z,%ymm\y,%ymm\z
vpackusdw %ymm\a1,%ymm\a0,%ymm\a0
vpblendw $0xAA,%ymm\x,%ymm\b0,%ymm\y
vpblendw $0xAA,%ymm\x,%ymm\b1,%ymm\x
vpsrld $16,%ymm\b0,%ymm\b0
vpsrld $16,%ymm\b1,%ymm\b1
vpackusdw %ymm\x,%ymm\y,%ymm\y
vpackusdw %ymm\b1,%ymm\b0,%ymm\b0

#reduce
vpmullw %ymm1,%ymm\z,%ymm\z
vpmullw %ymm1,%ymm\y,%ymm\y
vpmulhw %ymm0,%ymm\z,%ymm\z
vpmulhw %ymm0,%ymm\y,%ymm\y
vpsubw %ymm\z,%ymm\a0,%ymm\a0
vpsubw %ymm\y,%ymm\b0,%ymm\b0
.macro schoolbook off
vmovdqa _16XQINV*2(%rcx),%ymm0
vmovdqa (64*\off+ 0)*2(%rsi),%ymm1 # a0
vmovdqa (64*\off+16)*2(%rsi),%ymm2 # b0
vmovdqa (64*\off+32)*2(%rsi),%ymm3 # a1
vmovdqa (64*\off+48)*2(%rsi),%ymm4 # b1

vpmullw %ymm0,%ymm1,%ymm9 # a0.lo
vpmullw %ymm0,%ymm2,%ymm10 # b0.lo
vpmullw %ymm0,%ymm3,%ymm11 # a1.lo
vpmullw %ymm0,%ymm4,%ymm12 # b1.lo

vmovdqa (64*\off+ 0)*2(%rdx),%ymm5 # c0
vmovdqa (64*\off+16)*2(%rdx),%ymm6 # d0

vpmulhw %ymm5,%ymm1,%ymm13 # a0c0.hi
vpmulhw %ymm6,%ymm1,%ymm1 # a0d0.hi
vpmulhw %ymm5,%ymm2,%ymm14 # b0c0.hi
vpmulhw %ymm6,%ymm2,%ymm2 # b0d0.hi

vmovdqa (64*\off+32)*2(%rdx),%ymm7 # c1
vmovdqa (64*\off+48)*2(%rdx),%ymm8 # d1

vpmulhw %ymm7,%ymm3,%ymm15 # a1c1.hi
vpmulhw %ymm8,%ymm3,%ymm3 # a1d1.hi
vpmulhw %ymm7,%ymm4,%ymm0 # b1c1.hi
vpmulhw %ymm8,%ymm4,%ymm4 # b1d1.hi

vmovdqa %ymm13,(%rsp)

vpmullw %ymm5,%ymm9,%ymm13 # a0c0.lo
vpmullw %ymm6,%ymm9,%ymm9 # a0d0.lo
vpmullw %ymm5,%ymm10,%ymm5 # b0c0.lo
vpmullw %ymm6,%ymm10,%ymm10 # b0d0.lo

vpmullw %ymm7,%ymm11,%ymm6 # a1c1.lo
vpmullw %ymm8,%ymm11,%ymm11 # a1d1.lo
vpmullw %ymm7,%ymm12,%ymm7 # b1c1.lo
vpmullw %ymm8,%ymm12,%ymm12 # b1d1.lo

vmovdqa _16XQ*2(%rcx),%ymm8
vpmulhw %ymm8,%ymm13,%ymm13
vpmulhw %ymm8,%ymm9,%ymm9
vpmulhw %ymm8,%ymm5,%ymm5
vpmulhw %ymm8,%ymm10,%ymm10
vpmulhw %ymm8,%ymm6,%ymm6
vpmulhw %ymm8,%ymm11,%ymm11
vpmulhw %ymm8,%ymm7,%ymm7
vpmulhw %ymm8,%ymm12,%ymm12

vpsubw (%rsp),%ymm13,%ymm13 # -a0c0
vpsubw %ymm9,%ymm1,%ymm9 # a0d0
vpsubw %ymm5,%ymm14,%ymm5 # b0c0
vpsubw %ymm10,%ymm2,%ymm10 # b0d0

vpsubw %ymm6,%ymm15,%ymm6 # a1c1
vpsubw %ymm11,%ymm3,%ymm11 # a1d1
vpsubw %ymm7,%ymm0,%ymm7 # b1c1
vpsubw %ymm12,%ymm4,%ymm12 # b1d1

vmovdqa (%r9),%ymm0
vmovdqa 32(%r9),%ymm1
vpmullw %ymm0,%ymm10,%ymm2
vpmullw %ymm0,%ymm12,%ymm3
vpmulhw %ymm1,%ymm10,%ymm10
vpmulhw %ymm1,%ymm12,%ymm12
vpmulhw %ymm8,%ymm2,%ymm2
vpmulhw %ymm8,%ymm3,%ymm3
vpsubw %ymm2,%ymm10,%ymm10 # rb0d0
vpsubw %ymm3,%ymm12,%ymm12 # rb1d1

vpaddw %ymm5,%ymm9,%ymm9
vpaddw %ymm7,%ymm11,%ymm11
vpsubw %ymm13,%ymm10,%ymm13
vpsubw %ymm12,%ymm6,%ymm6

vmovdqa %ymm13,(64*\off+ 0)*2(%rdi)
vmovdqa %ymm9,(64*\off+16)*2(%rdi)
vmovdqa %ymm6,(64*\off+32)*2(%rdi)
vmovdqa %ymm11,(64*\off+48)*2(%rdi)
.endm

.text
basemul64_acc_avx:
poly0.0:
schoolbook 0,0

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.0:
schoolbook 512,0

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6



#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,(%rdi)
vmovdqa %ymm5,32(%rdi)

poly0.1:
schoolbook 64,1

#mov
vmovdqa %ymm14,%ymm3
vmovdqa %ymm9,%ymm4
vmovdqa %ymm12,%ymm5
vmovdqa %ymm7,%ymm6

poly1.1:
schoolbook 576,1

#add
vpaddd %ymm14,%ymm3,%ymm3
vpaddd %ymm9,%ymm4,%ymm4
vpaddd %ymm12,%ymm5,%ymm5
vpaddd %ymm7,%ymm6,%ymm6



#reduce
red 3,4,5,6,7,8,9

#store
vmovdqa %ymm3,64(%rdi)
vmovdqa %ymm5,96(%rdi)

ret

.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx)
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx)
cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx):
_cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx):
#consts
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_acc_avx

ret

basemul64_avx:
schoolbook 0,0

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,(%rdi)
vmovdqa %ymm12,32(%rdi)

schoolbook 64,1

#reduce
red 14,9,12,7,8,10,11

#store
vmovdqa %ymm14,64(%rdi)
vmovdqa %ymm12,96(%rdi)

ret

# PQCLEAN_KYBER51290S_AVX2_basemul_avx: exported entry point (with and
# without a leading underscore).
# NOTE(review): this listing appears to contain two revisions of the body
# back to back: a call-based one that passes two arguments to schoolbook via
# basemul64_avx, and a macro-based one that expands schoolbook with a single
# argument. There is no ret between them. Confirm which lines are current in
# the repository before editing.
.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx)
.global _cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx)
cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx):
_cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx):
#consts
# aligned loads; table indices are scaled by 2 to form byte offsets
vmovdqa _16XQ*2(%rcx),%ymm0
vmovdqa _16XQINV*2(%rcx),%ymm1

# four 128-byte chunks, each with its own %ymm2 operand from _ZETAS_EXP
vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2
call basemul64_avx

vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx

vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2
add $128,%rdi
add $128,%rsi
add $128,%rdx
call basemul64_avx
# save the incoming stack pointer, round %rsp down to a 32-byte boundary and
# reserve 32 bytes of scratch space
mov %rsp,%r8
and $-32,%rsp
sub $32,%rsp

# %r9 walks the _ZETAS_EXP table: start at entry 176, then step by 64, 384
# and 64 bytes between the four schoolbook expansions
lea (_ZETAS_EXP+176)*2(%rcx),%r9
schoolbook 0

add $32*2,%r9
schoolbook 1

add $192*2,%r9
schoolbook 2

add $32*2,%r9
schoolbook 3

# restore the stack pointer saved in %r8
mov %r8,%rsp
ret

+ 109
- 50
crypto_kem/kyber512-90s/avx2/cbd.c 查看文件

@@ -4,66 +4,125 @@
#include <stdint.h>

/*************************************************
* Name: PQCLEAN_KYBER51290S_AVX2_cbd
* Name: cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter KYBER_ETA
* a centered binomial distribution with parameter eta=2
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *buf: pointer to input byte array
* Arguments: - poly *r: pointer to output polynomial
* - const __m256i *buf: pointer to aligned input byte array
*
* NOTE(review): this listing appears to interleave two revisions of the
* routine (two Name lines, two signatures, a vec0..vec3 code path and an
* f0..f3 code path inside one loop). Confirm which lines are current
* before editing.
**************************************************/
void PQCLEAN_KYBER51290S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) {
static void cbd2(poly *restrict r, const __m256i buf[2 * KYBER_N / 128]) {
unsigned int i;
__m256i vec0, vec1, vec2, vec3, tmp;
__m256i f0, f1, f2, f3;
const __m256i mask55 = _mm256_set1_epi32(0x55555555);
const __m256i mask33 = _mm256_set1_epi32(0x33333333);
const __m256i mask03 = _mm256_set1_epi32(0x03030303);
const __m256i mask0F = _mm256_set1_epi32(0x0F0F0F0F);

for (i = 0; i < KYBER_N / 64; i++) {
/* vec0..vec3 code path: consumes 32 input bytes at byte offset 32 * i and
 * writes 64 coefficients through r->coeffs */
vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]);

vec1 = _mm256_srli_epi32(vec0, 1);
vec0 = _mm256_and_si256(mask55, vec0);
vec1 = _mm256_and_si256(mask55, vec1);
vec0 = _mm256_add_epi32(vec0, vec1);

vec1 = _mm256_srli_epi32(vec0, 2);
vec0 = _mm256_and_si256(mask33, vec0);
vec1 = _mm256_and_si256(mask33, vec1);

vec2 = _mm256_srli_epi32(vec0, 4);
vec3 = _mm256_srli_epi32(vec1, 4);
vec0 = _mm256_and_si256(mask03, vec0);
vec1 = _mm256_and_si256(mask03, vec1);
vec2 = _mm256_and_si256(mask03, vec2);
vec3 = _mm256_and_si256(mask03, vec3);

vec1 = _mm256_sub_epi8(vec0, vec1);
vec3 = _mm256_sub_epi8(vec2, vec3);

vec0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec1));
vec1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec1, 1));
vec2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(vec3));
vec3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vec3, 1));

tmp = _mm256_unpacklo_epi16(vec0, vec2);
vec2 = _mm256_unpackhi_epi16(vec0, vec2);
vec0 = tmp;
tmp = _mm256_unpacklo_epi16(vec1, vec3);
vec3 = _mm256_unpackhi_epi16(vec1, vec3);
vec1 = tmp;

tmp = _mm256_permute2x128_si256(vec0, vec2, 0x20);
vec2 = _mm256_permute2x128_si256(vec0, vec2, 0x31);
vec0 = tmp;
tmp = _mm256_permute2x128_si256(vec1, vec3, 0x20);
vec3 = _mm256_permute2x128_si256(vec1, vec3, 0x31);
vec1 = tmp;

_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 0], vec0);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 16], vec2);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 32], vec1);
_mm256_store_si256((__m256i *)&r->coeffs[64 * i + 48], vec3);
/* f0..f3 code path: consumes one aligned 32-byte vector buf[i] and writes
 * four vectors through r->vec */
f0 = _mm256_load_si256(&buf[i]);

/* each 2-bit field now holds the number of set bits in that bit pair (0..2);
 * mask55 also discards the bit that the 16-bit shift carries across bytes */
f1 = _mm256_srli_epi16(f0, 1);
f0 = _mm256_and_si256(mask55, f0);
f1 = _mm256_and_si256(mask55, f1);
f0 = _mm256_add_epi8(f0, f1);

/* each nibble becomes a - b + 3, where a and b are its two 2-bit counts;
 * the +3 bias keeps every nibble non-negative so no borrow crosses nibbles */
f1 = _mm256_srli_epi16(f0, 2);
f0 = _mm256_and_si256(mask33, f0);
f1 = _mm256_and_si256(mask33, f1);
f0 = _mm256_add_epi8(f0, mask33);
f0 = _mm256_sub_epi8(f0, f1);

/* split low and high nibbles into separate bytes and remove the bias,
 * leaving signed byte values in [-2, 2] */
f1 = _mm256_srli_epi16(f0, 4);
f0 = _mm256_and_si256(mask0F, f0);
f1 = _mm256_and_si256(mask0F, f1);
f0 = _mm256_sub_epi8(f0, mask03);
f1 = _mm256_sub_epi8(f1, mask03);

/* interleave so the two coefficients from one input byte are adjacent */
f2 = _mm256_unpacklo_epi8(f0, f1);
f3 = _mm256_unpackhi_epi8(f0, f1);

/* sign-extend each 128-bit half from 8-bit to 16-bit lanes */
f0 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f2));
f1 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f2, 1));
f2 = _mm256_cvtepi8_epi16(_mm256_castsi256_si128(f3));
f3 = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(f3, 1));

/* store order f0, f2, f1, f3 compensates for the unpack instructions
 * working independently inside each 128-bit lane */
_mm256_store_si256(&r->vec[4 * i + 0], f0);
_mm256_store_si256(&r->vec[4 * i + 1], f2);
_mm256_store_si256(&r->vec[4 * i + 2], f1);
_mm256_store_si256(&r->vec[4 * i + 3], f3);
}
}

/*************************************************
* Name: cbd3
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter eta=3
* This function is only needed for Kyber-512
*
* Arguments: - poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array of
* 3*KYBER_N/4+8 bytes; it is read with unaligned 32-byte
* loads, so no particular alignment is required here
**************************************************/
static void cbd3(poly *restrict r, const uint8_t buf[3 * KYBER_N / 4 + 8]) {
unsigned int i;
__m256i f0, f1, f2, f3;
/* mask249 selects every third bit of a 24-bit group; mask6DB is the value 3
 * replicated in every 3-bit field of such a group */
const __m256i mask249 = _mm256_set1_epi32(0x249249);
const __m256i mask6DB = _mm256_set1_epi32(0x6DB6DB);
const __m256i mask07 = _mm256_set1_epi32(7);
const __m256i mask70 = _mm256_set1_epi32(7 << 16);
const __m256i mask3 = _mm256_set1_epi16(3);
/* per 128-bit lane: gather three consecutive bytes into each 32-bit element
 * and zero the top byte (index -1); the upper lane starts at byte 4 */
const __m256i shufbidx = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, -1, 9, 8, 7, -1, 6, 5, 4,
-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);

for (i = 0; i < KYBER_N / 32; i++) {
/* Each iteration consumes 24 bytes but loads 32. The final load ends at
 * 24 * (KYBER_N / 32 - 1) + 32 = 3 * KYBER_N / 4 + 8, which is why the
 * declared buffer size carries the extra 8 bytes. */
f0 = _mm256_loadu_si256((__m256i *)&buf[24 * i]);
/* 0x94 places qwords 0,1 in the low lane and qwords 1,2 in the high lane, so
 * after the byte shuffle the high lane holds input bytes 12..23 */
f0 = _mm256_permute4x64_epi64(f0, 0x94);
f0 = _mm256_shuffle_epi8(f0, shufbidx);

/* each 3-bit field becomes the number of set bits among its three bits (0..3) */
f1 = _mm256_srli_epi32(f0, 1);
f2 = _mm256_srli_epi32(f0, 2);
f0 = _mm256_and_si256(mask249, f0);
f1 = _mm256_and_si256(mask249, f1);
f2 = _mm256_and_si256(mask249, f2);
f0 = _mm256_add_epi32(f0, f1);
f0 = _mm256_add_epi32(f0, f2);

/* fields at bit offsets 0, 6, 12, 18 become a - b + 3, where b is the next
 * higher 3-bit count; the +3 bias prevents borrows between fields */
f1 = _mm256_srli_epi32(f0, 3);
f0 = _mm256_add_epi32(f0, mask6DB);
f0 = _mm256_sub_epi32(f0, f1);

/* move the four biased values of each 32-bit element into 16-bit lanes:
 * f0 gets the fields from bits 0 and 6, f1 those from bits 12 and 18 */
f1 = _mm256_slli_epi32(f0, 10);
f2 = _mm256_srli_epi32(f0, 12);
f3 = _mm256_srli_epi32(f0, 2);
f0 = _mm256_and_si256(f0, mask07);
f1 = _mm256_and_si256(f1, mask70);
f2 = _mm256_and_si256(f2, mask07);
f3 = _mm256_and_si256(f3, mask70);
f0 = _mm256_add_epi16(f0, f1);
f1 = _mm256_add_epi16(f2, f3);
/* remove the bias: 16-bit coefficients in [-3, 3] */
f0 = _mm256_sub_epi16(f0, mask3);
f1 = _mm256_sub_epi16(f1, mask3);

/* interleave 32-bit pairs so the four coefficients of one input group are
 * contiguous, then regroup 128-bit lanes into sequential order */
f2 = _mm256_unpacklo_epi32(f0, f1);
f3 = _mm256_unpackhi_epi32(f0, f1);

f0 = _mm256_permute2x128_si256(f2, f3, 0x20);
f1 = _mm256_permute2x128_si256(f2, f3, 0x31);

/* 32 coefficients per iteration, written with aligned stores */
_mm256_store_si256(&r->vec[2 * i + 0], f0);
_mm256_store_si256(&r->vec[2 * i + 1], f1);
}
}

/*
 * Samples r with the eta1 noise sampler by forwarding to cbd3.
 *
 * buf is declared one __m256i (32 bytes) longer than
 * KYBER_ETA1 * KYBER_N / 128 vectors because cbd3 takes a byte buffer that
 * extends past the bytes it actually consumes.
 *
 * The cast reinterprets the vector array as bytes and keeps the const
 * qualifier, matching the const uint8_t parameter of cbd3.
 */
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]) {
    cbd3(r, (const uint8_t *)buf);
}

/* Samples r with the eta2 noise sampler by forwarding r and buf unchanged to cbd2. */
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]) {
cbd2(r, buf);
}

+ 4
- 1
crypto_kem/kyber512-90s/avx2/cbd.h 查看文件

@@ -2,8 +2,11 @@
#define PQCLEAN_KYBER51290S_AVX2_CBD_H
#include "params.h"
#include "poly.h"
#include <immintrin.h>
#include <stdint.h>

void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]);
void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta1(poly *r, const __m256i buf[KYBER_ETA1 * KYBER_N / 128 + 1]);

void PQCLEAN_KYBER51290S_AVX2_poly_cbd_eta2(poly *r, const __m256i buf[KYBER_ETA2 * KYBER_N / 128]);

#endif

部分文件因文件數量過多而無法顯示

Loading…
取消
儲存