Browse Source

Add Dilithium's AVX2 implementations

kyber
Thom Wiggers 5 years ago
committed by Kris Kwiatkowski
parent
commit
6682216791
100 changed files with 9261 additions and 810 deletions
  1. +10
    -1
      crypto_sign/dilithium2/META.yml
  2. +6
    -0
      crypto_sign/dilithium2/avx2/LICENSE
  3. +43
    -0
      crypto_sign/dilithium2/avx2/Makefile
  4. +22
    -0
      crypto_sign/dilithium2/avx2/alignment.h
  5. +37
    -0
      crypto_sign/dilithium2/avx2/api.h
  6. +239
    -0
      crypto_sign/dilithium2/avx2/fips202x4.c
  7. +65
    -0
      crypto_sign/dilithium2/avx2/fips202x4.h
  8. +281
    -0
      crypto_sign/dilithium2/avx2/invntt.s
  9. +26
    -0
      crypto_sign/dilithium2/avx2/ntt.h
  10. +178
    -0
      crypto_sign/dilithium2/avx2/ntt.s
  11. +80
    -0
      crypto_sign/dilithium2/avx2/nttconsts.c
  12. +27
    -0
      crypto_sign/dilithium2/avx2/nttconsts.h
  13. +305
    -0
      crypto_sign/dilithium2/avx2/packing.c
  14. +36
    -0
      crypto_sign/dilithium2/avx2/packing.h
  15. +33
    -0
      crypto_sign/dilithium2/avx2/params.h
  16. +189
    -0
      crypto_sign/dilithium2/avx2/pointwise.S
  17. +914
    -0
      crypto_sign/dilithium2/avx2/poly.c
  18. +83
    -0
      crypto_sign/dilithium2/avx2/poly.h
  19. +353
    -0
      crypto_sign/dilithium2/avx2/polyvec.c
  20. +52
    -0
      crypto_sign/dilithium2/avx2/polyvec.h
  21. +9
    -0
      crypto_sign/dilithium2/avx2/reduce.h
  22. +91
    -0
      crypto_sign/dilithium2/avx2/reduce.s
  23. +443
    -0
      crypto_sign/dilithium2/avx2/rejsample.c
  24. +26
    -0
      crypto_sign/dilithium2/avx2/rejsample.h
  25. +115
    -0
      crypto_sign/dilithium2/avx2/rounding.c
  26. +12
    -0
      crypto_sign/dilithium2/avx2/rounding.h
  27. +23
    -0
      crypto_sign/dilithium2/avx2/shuffle.inc
  28. +433
    -0
      crypto_sign/dilithium2/avx2/sign.c
  29. +15
    -0
      crypto_sign/dilithium2/avx2/sign.h
  30. +26
    -0
      crypto_sign/dilithium2/avx2/stream.c
  31. +15
    -0
      crypto_sign/dilithium2/avx2/stream.h
  32. +23
    -0
      crypto_sign/dilithium2/avx2/symmetric.h
  33. +6
    -2
      crypto_sign/dilithium2/clean/LICENSE
  34. +3
    -3
      crypto_sign/dilithium2/clean/Makefile
  35. +1
    -1
      crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake
  36. +14
    -10
      crypto_sign/dilithium2/clean/api.h
  37. +15
    -14
      crypto_sign/dilithium2/clean/ntt.c
  38. +4
    -3
      crypto_sign/dilithium2/clean/ntt.h
  39. +62
    -54
      crypto_sign/dilithium2/clean/packing.c
  40. +29
    -24
      crypto_sign/dilithium2/clean/packing.h
  41. +3
    -5
      crypto_sign/dilithium2/clean/params.h
  42. +131
    -164
      crypto_sign/dilithium2/clean/poly.c
  43. +16
    -15
      crypto_sign/dilithium2/clean/poly.h
  44. +29
    -26
      crypto_sign/dilithium2/clean/polyvec.c
  45. +5
    -4
      crypto_sign/dilithium2/clean/polyvec.h
  46. +7
    -6
      crypto_sign/dilithium2/clean/reduce.c
  47. +2
    -2
      crypto_sign/dilithium2/clean/reduce.h
  48. +30
    -16
      crypto_sign/dilithium2/clean/rounding.c
  49. +2
    -2
      crypto_sign/dilithium2/clean/rounding.h
  50. +94
    -70
      crypto_sign/dilithium2/clean/sign.c
  51. +5
    -23
      crypto_sign/dilithium2/clean/sign.h
  52. +26
    -0
      crypto_sign/dilithium2/clean/stream.c
  53. +15
    -0
      crypto_sign/dilithium2/clean/stream.h
  54. +0
    -32
      crypto_sign/dilithium2/clean/symmetric.c
  55. +9
    -9
      crypto_sign/dilithium2/clean/symmetric.h
  56. +10
    -1
      crypto_sign/dilithium3/META.yml
  57. +6
    -0
      crypto_sign/dilithium3/avx2/LICENSE
  58. +43
    -0
      crypto_sign/dilithium3/avx2/Makefile
  59. +22
    -0
      crypto_sign/dilithium3/avx2/alignment.h
  60. +37
    -0
      crypto_sign/dilithium3/avx2/api.h
  61. +239
    -0
      crypto_sign/dilithium3/avx2/fips202x4.c
  62. +65
    -0
      crypto_sign/dilithium3/avx2/fips202x4.h
  63. +281
    -0
      crypto_sign/dilithium3/avx2/invntt.s
  64. +26
    -0
      crypto_sign/dilithium3/avx2/ntt.h
  65. +178
    -0
      crypto_sign/dilithium3/avx2/ntt.s
  66. +80
    -0
      crypto_sign/dilithium3/avx2/nttconsts.c
  67. +27
    -0
      crypto_sign/dilithium3/avx2/nttconsts.h
  68. +305
    -0
      crypto_sign/dilithium3/avx2/packing.c
  69. +36
    -0
      crypto_sign/dilithium3/avx2/packing.h
  70. +33
    -0
      crypto_sign/dilithium3/avx2/params.h
  71. +191
    -0
      crypto_sign/dilithium3/avx2/pointwise.S
  72. +914
    -0
      crypto_sign/dilithium3/avx2/poly.c
  73. +83
    -0
      crypto_sign/dilithium3/avx2/poly.h
  74. +353
    -0
      crypto_sign/dilithium3/avx2/polyvec.c
  75. +52
    -0
      crypto_sign/dilithium3/avx2/polyvec.h
  76. +9
    -0
      crypto_sign/dilithium3/avx2/reduce.h
  77. +91
    -0
      crypto_sign/dilithium3/avx2/reduce.s
  78. +443
    -0
      crypto_sign/dilithium3/avx2/rejsample.c
  79. +26
    -0
      crypto_sign/dilithium3/avx2/rejsample.h
  80. +115
    -0
      crypto_sign/dilithium3/avx2/rounding.c
  81. +12
    -0
      crypto_sign/dilithium3/avx2/rounding.h
  82. +23
    -0
      crypto_sign/dilithium3/avx2/shuffle.inc
  83. +446
    -0
      crypto_sign/dilithium3/avx2/sign.c
  84. +15
    -0
      crypto_sign/dilithium3/avx2/sign.h
  85. +26
    -0
      crypto_sign/dilithium3/avx2/stream.c
  86. +15
    -0
      crypto_sign/dilithium3/avx2/stream.h
  87. +23
    -0
      crypto_sign/dilithium3/avx2/symmetric.h
  88. +6
    -2
      crypto_sign/dilithium3/clean/LICENSE
  89. +3
    -3
      crypto_sign/dilithium3/clean/Makefile
  90. +1
    -1
      crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake
  91. +13
    -9
      crypto_sign/dilithium3/clean/api.h
  92. +15
    -14
      crypto_sign/dilithium3/clean/ntt.c
  93. +4
    -3
      crypto_sign/dilithium3/clean/ntt.h
  94. +62
    -54
      crypto_sign/dilithium3/clean/packing.c
  95. +29
    -24
      crypto_sign/dilithium3/clean/packing.h
  96. +4
    -4
      crypto_sign/dilithium3/clean/params.h
  97. +131
    -164
      crypto_sign/dilithium3/clean/poly.c
  98. +16
    -15
      crypto_sign/dilithium3/clean/poly.h
  99. +29
    -26
      crypto_sign/dilithium3/clean/polyvec.c
  100. +5
    -4
      crypto_sign/dilithium3/clean/polyvec.h

+ 10
- 1
crypto_sign/dilithium2/META.yml View File

@@ -17,4 +17,13 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/dilithium/commit/40f79645879b5c69835cd91d06945d7c24f39922
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
- name: avx2
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
required_flags:
- avx2
- bmi2

+ 6
- 0
crypto_sign/dilithium2/avx2/LICENSE View File

@@ -0,0 +1,6 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.

+ 43
- 0
crypto_sign/dilithium2/avx2/Makefile View File

@@ -0,0 +1,43 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium2_avx2.a

SOURCES = fips202x4.c invntt.s nttconsts.c ntt.s packing.c pointwise.S poly.c \
polyvec.c reduce.s rejsample.c rounding.c sign.c stream.c
OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \
polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o
HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \
fips202x4.h shuffle.inc

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \
-Wmissing-prototypes -Wredundant-decls -std=c99 \
-Wcast-align \
-mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS)

all: $(LIB)

KECCAK4XDIR=../../../common/keccak4x
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.S $(HEADERS)
$(AS) -c -o $@ $<

$(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^

$(KECCAK4X):
$(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)
$(MAKE) -C $(KECCAK4XDIR) clean


+ 22
- 0
crypto_sign/dilithium2/avx2/alignment.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H
#define PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H

/*
 * ALIGNED_UINT8(N): anonymous union holding N bytes that can be viewed either
 * as a byte array (as_arr) or as 256-bit vectors (as_vec).  The __m256i member
 * gives the whole object the alignment of __m256i.
 * N should be a multiple of 32 so that both views cover the same storage.
 */
#define ALIGNED_UINT8(N) \
    union { \
        uint8_t as_arr[N]; \
        __m256i as_vec[(N)/32]; \
    }

/*
 * ALIGNED_UINT32(N): anonymous union holding N 32-bit words that can be viewed
 * either as a uint32_t array (as_arr) or as 256-bit vectors (as_vec, eight
 * words per vector).  The __m256i member gives the object the alignment of
 * __m256i.
 */
#define ALIGNED_UINT32(N) \
union { \
uint32_t as_arr[N]; \
__m256i as_vec[(N)/8]; \
}

/*
 * ALIGNED_UINT64(N): anonymous union holding N 64-bit words that can be viewed
 * either as a uint64_t array (as_arr) or as 256-bit vectors (as_vec).
 * A __m256i holds four 64-bit words, so the vector view has N/4 elements.
 * N should be a multiple of 4 so that both views cover the same storage.
 */
#define ALIGNED_UINT64(N) \
    union { \
        uint64_t as_arr[N]; \
        __m256i as_vec[(N)/4]; \
    }

#endif //PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H

+ 37
- 0
crypto_sign/dilithium2/avx2/api.h View File

@@ -0,0 +1,37 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_API_H
#define PQCLEAN_DILITHIUM2_AVX2_API_H

#include <stddef.h>
#include <stdint.h>

/* Fixed object sizes, in bytes (public key, secret key, signature) -- per the macro names. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1184U
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2800U
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2044U

/* Human-readable algorithm name. */
#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2"


/* Key generation: writes the public key to pk and the secret key to sk. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

/*
 * Combined sign: takes message msg of len bytes and secret key sk; writes the
 * signed message to sm and its length to *smlen.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

/*
 * Combined open: takes signed message sm of smlen bytes and public key pk;
 * writes the recovered message to m and its length to *mlen.
 * NOTE(review): return-value convention is defined in sign.c -- confirm there.
 */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

/* Detached signature: signs m (mlen bytes) with sk; writes sig and *siglen. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

/* Detached verification of sig (siglen bytes) over m (mlen bytes) under pk. */
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);



#endif

+ 239
- 0
crypto_sign/dilithium2/avx2/fips202x4.c View File

@@ -0,0 +1,239 @@
#include <immintrin.h>
#include <stdint.h>

#include "fips202.h"
#include "fips202x4.h"
#include "params.h"

#define NROUNDS 24
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))

/* Read eight bytes at x as a little-endian 64-bit integer. */
static uint64_t load64(const uint8_t *x) {
    uint64_t word = 0;
    int pos;

    /* Walk from the most significant byte (x[7]) down to x[0]. */
    for (pos = 7; pos >= 0; --pos) {
        word = (word << 8) | (uint64_t)x[pos];
    }

    return word;
}

/* Write u to the eight bytes at x in little-endian order. */
static void store64(uint8_t *x, uint64_t u) {
    uint8_t *const stop = x + 8;

    while (x != stop) {
        *x++ = (uint8_t)u;   /* emit the current least significant byte */
        u >>= 8;
    }
}

/* Use implementation from the Keccak Code Package */
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s);
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds

/*
 * Absorb four equal-length messages into a 4-way interleaved Keccak state.
 *
 * s     : 25 vectors; 64-bit lane j of s[i] is state word i of instance j.
 *         The state is cleared here, so the caller need not initialise it.
 * r     : rate in bytes (at most 200, the size of the padding buffers).
 * m0..m3: the four input messages, each mlen bytes long.
 * mlen  : message length in bytes.
 * p     : domain-separation byte placed directly after the message.
 *
 * Full blocks are XORed into the state and permuted.  The final padded block
 * is XORed in but NOT permuted; the squeeze routine permutes before output.
 */
static void keccak_absorb4x(__m256i *s,
                            unsigned int r,
                            const uint8_t *m0,
                            const uint8_t *m1,
                            const uint8_t *m2,
                            const uint8_t *m3,
                            unsigned long long mlen,
                            uint8_t p) {
    unsigned long long i;
    uint8_t t0[200];
    uint8_t t1[200];
    uint8_t t2[200];
    uint8_t t3[200];
    uint64_t *ss = (uint64_t *)s;

    /* Zero the state without reading it: callers pass uninitialised storage. */
    for (i = 0; i < 25; ++i) {
        s[i] = _mm256_setzero_si256();
    }

    /* Absorb all complete rate-sized blocks. */
    while (mlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            ss[4 * i + 0] ^= load64(m0 + 8 * i);
            ss[4 * i + 1] ^= load64(m1 + 8 * i);
            ss[4 * i + 2] ^= load64(m2 + 8 * i);
            ss[4 * i + 3] ^= load64(m3 + 8 * i);
        }

        KeccakF1600_StatePermute4x(s);
        mlen -= r;
        m0 += r;
        m1 += r;
        m2 += r;
        m3 += r;
    }

    /* Build the last block: remaining bytes, then padding. */
    for (i = 0; i < r; ++i) {
        t0[i] = 0;
        t1[i] = 0;
        t2[i] = 0;
        t3[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t0[i] = m0[i];
        t1[i] = m1[i];
        t2[i] = m2[i];
        t3[i] = m3[i];
    }

    /* i == mlen < r here: domain-separation byte follows the message. */
    t0[i] = p;
    t1[i] = p;
    t2[i] = p;
    t3[i] = p;

    /* Final padding bit in the last byte of the block. */
    t0[r - 1] |= 128;
    t1[r - 1] |= 128;
    t2[r - 1] |= 128;
    t3[r - 1] |= 128;

    for (i = 0; i < r / 8; ++i) {
        ss[4 * i + 0] ^= load64(t0 + 8 * i);
        ss[4 * i + 1] ^= load64(t1 + 8 * i);
        ss[4 * i + 2] ^= load64(t2 + 8 * i);
        ss[4 * i + 3] ^= load64(t3 + 8 * i);
    }
}


/*
 * Squeeze nblocks blocks of r bytes each from a 4-way interleaved Keccak
 * state into the four output buffers h0..h3.  The state is permuted before
 * every block is written, and is left updated for further squeezing.
 */
static void keccak_squeezeblocks4x(uint8_t *h0,
                                   uint8_t *h1,
                                   uint8_t *h2,
                                   uint8_t *h3,
                                   unsigned long nblocks,
                                   unsigned int r,
                                   __m256i *s) {
    uint64_t *lanes = (uint64_t *)s;
    const unsigned int words = r / 8;
    unsigned long blk;
    unsigned int w;

    for (blk = 0; blk < nblocks; ++blk) {
        KeccakF1600_StatePermute4x(s);

        /* Lane j of state word w belongs to output stream j. */
        for (w = 0; w < words; ++w) {
            const uint64_t *quad = lanes + 4 * w;
            store64(h0 + 8 * w, quad[0]);
            store64(h1 + 8 * w, quad[1]);
            store64(h2 + 8 * w, quad[2]);
            store64(h3 + 8 * w, quad[3]);
        }

        h0 += r;
        h1 += r;
        h2 += r;
        h3 += r;
    }
}

/*
 * 4-way SHAKE128 absorb: clears the state s and absorbs the four mlen-byte
 * messages m0..m3 using rate SHAKE128_RATE and domain-separation byte 0x1F.
 * s must point to 25 __m256i vectors.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen) {
keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*
 * 4-way SHAKE128 squeeze: writes nblocks blocks of SHAKE128_RATE bytes to each
 * of h0..h3 from state s, which is updated so squeezing can continue.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s);
}

/*
 * 4-way SHAKE256 absorb: clears the state s and absorbs the four mlen-byte
 * messages m0..m3 using rate SHAKE256_RATE and domain-separation byte 0x1F.
 * s must point to 25 __m256i vectors.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen) {
keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*
 * 4-way SHAKE256 squeeze: writes nblocks blocks of SHAKE256_RATE bytes to each
 * of h0..h3 from state s, which is updated so squeezing can continue.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s);
}

/*
 * One-shot 4-way SHAKE128: hashes the four mlen-byte messages m0..m3 and
 * writes hlen output bytes to each of h0..h3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long long hlen,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    __m256i s[25];
    uint8_t t[4][SHAKE128_RATE];
    unsigned long nblocks = hlen / SHAKE128_RATE;
    unsigned long whole = nblocks * SHAKE128_RATE;   /* bytes produced by full blocks */
    unsigned int i;

    PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen);

    /* Full blocks go straight into the caller's buffers. */
    PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
    hlen -= whole;

    if (hlen == 0) {
        return;
    }

    /* Partial trailing block: squeeze into scratch, copy only what is needed. */
    PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
    for (i = 0; i < hlen; ++i) {
        h0[whole + i] = t[0][i];
        h1[whole + i] = t[1][i];
        h2[whole + i] = t[2][i];
        h3[whole + i] = t[3][i];
    }
}

/*
 * One-shot 4-way SHAKE256: hashes the four mlen-byte messages m0..m3 and
 * writes hlen output bytes to each of h0..h3.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256_4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long long hlen,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    __m256i s[25];
    uint8_t t[4][SHAKE256_RATE];
    unsigned long nblocks = hlen / SHAKE256_RATE;
    unsigned long whole = nblocks * SHAKE256_RATE;   /* bytes produced by full blocks */
    unsigned int i;

    PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen);

    /* Full blocks go straight into the caller's buffers. */
    PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
    hlen -= whole;

    if (hlen == 0) {
        return;
    }

    /* Partial trailing block: squeeze into scratch, copy only what is needed. */
    PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
    for (i = 0; i < hlen; ++i) {
        h0[whole + i] = t[0][i];
        h1[whole + i] = t[1][i];
        h2[whole + i] = t[2][i];
        h3[whole + i] = t[3][i];
    }
}

+ 65
- 0
crypto_sign/dilithium2/avx2/fips202x4.h View File

@@ -0,0 +1,65 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H
#define PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H

#include <immintrin.h>
#include <stdint.h>

#include "params.h"

/*
 * 4-way parallel SHAKE interface.  In every function the state s is an array
 * of 25 __m256i vectors, m0..m3 are four input messages of identical length
 * mlen, and h0..h3 are the four corresponding output buffers.
 */

/* Initialise s and absorb four mlen-byte messages (SHAKE128). */
void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

/* Squeeze nblocks full SHAKE128-rate blocks into each of h0..h3. */
void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s);

/* Initialise s and absorb four mlen-byte messages (SHAKE256). */
void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

/* Squeeze nblocks full SHAKE256-rate blocks into each of h0..h3. */
void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s);

/* One-shot SHAKE128: hlen output bytes per stream from mlen input bytes. */
void PQCLEAN_DILITHIUM2_AVX2_shake128_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long long hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

/* One-shot SHAKE256: hlen output bytes per stream from mlen input bytes. */
void PQCLEAN_DILITHIUM2_AVX2_shake256_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long long hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

#endif

+ 281
- 0
crypto_sign/dilithium2/avx2/invntt.s View File

@@ -0,0 +1,281 @@
.include "shuffle.inc"

# Inverse-NTT butterfly on four (l, h) register pairs.
# For each pair:  t = (l + ymm2 - h) * z ;  l = l + h ;
#                 h = (t + ((t * ymm0) mod 2^32) * ymm1) >> 32
# z0 multiplies the first two differences, z1 the last two (defaults: ymm15, ymm3).
# The routines in this file load ymm0 = 8xqinv, ymm1 = 8xq, ymm2 = 8x256q.
# vpmuludq only uses the low dword of each qword lane.
# Clobbers ymm12-ymm15 (so a z0 passed in ymm15 is consumed before it is overwritten).
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3
# t = l + ymm2 (keeps the difference below non-negative)
vpaddd %ymm2,%ymm\l0,%ymm12
vpaddd %ymm2,%ymm\l1,%ymm13
vpaddd %ymm2,%ymm\l2,%ymm14

# t -= h
vpsubd %ymm\h0,%ymm12,%ymm12
vpsubd %ymm\h1,%ymm13,%ymm13
vpsubd %ymm\h2,%ymm14,%ymm14

# t *= z0 for the first two pairs; start the fourth difference in ymm15
vpmuludq %ymm\z0,%ymm12,%ymm12
vpmuludq %ymm\z0,%ymm13,%ymm13
vpaddd %ymm2,%ymm\l3,%ymm15

vpmuludq %ymm\z1,%ymm14,%ymm14
vpsubd %ymm\h3,%ymm15,%ymm15
# l = l + h (interleaved with the multiplies)
vpaddd %ymm\l0,%ymm\h0,%ymm\l0

vpmuludq %ymm\z1,%ymm15,%ymm15
vpaddd %ymm\l1,%ymm\h1,%ymm\l1
vpaddd %ymm\l2,%ymm\h2,%ymm\l2

vpaddd %ymm\l3,%ymm\h3,%ymm\l3

# reduction of t into h: multiply by ymm0, then by ymm1, add t, shift right 32
vpmuludq %ymm0,%ymm12,%ymm\h0
vpmuludq %ymm0,%ymm13,%ymm\h1
vpmuludq %ymm0,%ymm14,%ymm\h2
vpmuludq %ymm0,%ymm15,%ymm\h3
vpmuludq %ymm1,%ymm\h0,%ymm\h0
vpmuludq %ymm1,%ymm\h1,%ymm\h1
vpmuludq %ymm1,%ymm\h2,%ymm\h2
vpmuludq %ymm1,%ymm\h3,%ymm\h3
vpaddq %ymm12,%ymm\h0,%ymm\h0
vpaddq %ymm13,%ymm\h1,%ymm\h1
vpaddq %ymm14,%ymm\h2,%ymm\h2
vpaddq %ymm15,%ymm\h3,%ymm\h3
vpsrlq $32,%ymm\h0,%ymm\h0
vpsrlq $32,%ymm\h1,%ymm\h1
vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3
.endm

# Inverse NTT, levels 0 to 4, on one group of 32 coefficients.
# Register use matches the prototype in ntt.h under the System V AMD64
# calling convention (assumed): rdi = tmp (output), rsi = a (input),
# rdx = zetas_inv.
# Reads 128 bytes from rsi (4 ymm), writes 256 bytes to rdi (8 ymm), i.e. each
# output coefficient occupies a 64-bit slot.  All accesses are vmovdqa, so the
# buffers and the constants must be 32-byte aligned.
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx
PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
vmovdqa 32(%rsi),%ymm7
vmovdqa 64(%rsi),%ymm5
vmovdqa 96(%rsi),%ymm10

#reorder
shuffle8 6,5,8,5
shuffle8 7,10,6,10

shuffle4 8,6,4,6
shuffle4 5,10,8,10

# odd dwords are moved down into the low half of each qword lane
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11

# level 0 is written out by hand (same steps as the butterfly macro) because
# it needs four different twiddle vectors, loaded from rdx+0/16/32/48
level0:
vpmovzxdq (%rdx),%ymm3
vpmovzxdq 16(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpmovzxdq 32(%rdx),%ymm5
vpmovzxdq 48(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

# reduction: multiply by qinv, then by q, add, shift right 32
vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level1:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# ymm15 and ymm3 are the macro's default z0 and z1
vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level2:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpmovzxdq 96(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11 3,3

#shuffle
shuffle4 4,5,3,5
shuffle4 6,7,4,7
shuffle4 8,9,6,9
shuffle4 10,11,8,11

level3:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# two twiddles: one in the low 128 bits, the other in the high 128 bits
vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10

butterfly 3,4,6,8,5,7,9,11 10,10

#shuffle
shuffle8 3,4,10,4
shuffle8 6,8,3,8
shuffle8 5,7,6,7
shuffle8 9,11,5,11

level4:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd 120(%rdx),%ymm9

butterfly 10,3,6,5,4,8,7,11 9,9

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

# Inverse NTT, levels 5 to 7, on one column of the 64-bit temporary buffer.
# Register use matches the prototype in ntt.h under the System V AMD64
# calling convention (assumed): rdi = a (output), rsi = tmp (input),
# rdx = zetas_inv.
# Reads 8 ymm from rsi at a stride of 256 bytes; writes 8 xmm (four 32-bit
# coefficients each) to rdi at a stride of 128 bytes.
.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx
PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 256(%rsi),%ymm5
vmovdqa 512(%rsi),%ymm6
vmovdqa 768(%rsi),%ymm7
vmovdqa 1024(%rsi),%ymm8
vmovdqa 1280(%rsi),%ymm9
vmovdqa 1536(%rsi),%ymm10
vmovdqa 1792(%rsi),%ymm11

# level 5 is written out by hand (same steps as the butterfly macro) because
# it needs four different twiddles, broadcast from rdx+0/4/8/12
level5:
vpbroadcastd (%rdx),%ymm3
vpbroadcastd 4(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpbroadcastd 8(%rdx),%ymm5
vpbroadcastd 12(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

# reduction: multiply by qinv, then by q, add, shift right 32
vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level6:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# ymm15 and ymm3 are the macro's default z0 and z1
vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level7:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd 24(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11 3,3

#consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3

# multiply ymm4-ymm7 by the 8xdiv constant and reduce as above
# (ymm8-ymm11 are stored without this extra multiplication)
vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
vpmuludq %ymm3,%ymm6,%ymm6
vpmuludq %ymm3,%ymm7,%ymm7
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm12,%ymm4,%ymm4
vpaddq %ymm13,%ymm5,%ymm5
vpaddq %ymm14,%ymm6,%ymm6
vpaddq %ymm15,%ymm7,%ymm7
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
# vpermd with the {0,2,4,6,...} mask gathers the low dword of every qword
# lane into the low 128 bits, which is what gets stored
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6
vpermd %ymm7,%ymm3,%ymm7
vpermd %ymm8,%ymm3,%ymm8
vpermd %ymm9,%ymm3,%ymm9
vpermd %ymm10,%ymm3,%ymm10
vpermd %ymm11,%ymm3,%ymm11
vmovdqa %xmm4,(%rdi)
vmovdqa %xmm5,128(%rdi)
vmovdqa %xmm6,256(%rdi)
vmovdqa %xmm7,384(%rdi)
vmovdqa %xmm8,512(%rdi)
vmovdqa %xmm9,640(%rdi)
vmovdqa %xmm10,768(%rdi)
vmovdqa %xmm11,896(%rdi)

ret

+ 26
- 0
crypto_sign/dilithium2/avx2/ntt.h View File

@@ -0,0 +1,26 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H
#define PQCLEAN_DILITHIUM2_AVX2_NTT_H

/*
 * Prototypes for the assembly NTT routines of the Dilithium2 AVX2
 * implementation.  The include guard carries the implementation namespace,
 * like the other headers in this directory, so it cannot collide with another
 * ntt.h that uses a generic guard name.
 */

#include <stdint.h>

#include "nttconsts.h"
#include "params.h"

/*
 * Forward NTT, first stage: reads 32-bit coefficients from a and writes
 * 64-bit intermediate values to tmp, using twiddle factors from zetas.
 */
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(uint64_t *tmp,
        const uint32_t *a,
        const uint32_t *zetas);
/*
 * Forward NTT, second stage: reads the 64-bit intermediate values from tmp
 * and writes 32-bit coefficients to a.
 */
void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(uint32_t *a,
        const uint64_t *tmp,
        const uint32_t *zetas);

/*
 * Inverse NTT, first stage: reads 32-bit coefficients from a and writes
 * 64-bit intermediate values to tmp, using twiddle factors from zetas_inv.
 */
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(uint64_t *tmp,
        const uint32_t *a,
        const uint32_t *zetas_inv);
/*
 * Inverse NTT, second stage: reads the 64-bit intermediate values from tmp
 * and writes 32-bit coefficients to a.
 */
void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(uint32_t *a,
        const uint64_t *tmp,
        const uint32_t *zetas_inv);

/*
 * Coefficient-wise products of a and b written to c; implemented in
 * pointwise.S.  NOTE(review): the _acc variant presumably accumulates over a
 * vector of polynomials -- confirm against pointwise.S.
 */
void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b);
void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b);

#endif

+ 178
- 0
crypto_sign/dilithium2/avx2/ntt.s View File

@@ -0,0 +1,178 @@
.include "shuffle.inc"

# Forward-NTT butterfly on four (rl, rh) register pairs.
# For each pair:  t  = reduce(rh * z), where
#                 reduce(x) = (x + ((x * ymm0) mod 2^32) * ymm1) >> 32
#                 rh = rl + ymm2 - t ;  rl = rl + t
# z0..z3 select the twiddle register for each pair (default ymm3 for all).
# The routines in this file load ymm0 = 8xqinv, ymm1 = 8xq, ymm2 = 8x2q.
# Clobbers ymm12-ymm15; twiddles passed in those registers are consumed by
# the multiplies before the registers are overwritten.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3
#mul
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3

#reduce
vpmuludq %ymm0,%ymm\rh0,%ymm12
vpmuludq %ymm0,%ymm\rh1,%ymm13
vpmuludq %ymm0,%ymm\rh2,%ymm14
vpmuludq %ymm0,%ymm\rh3,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm\rh0,%ymm12,%ymm12
vpaddq %ymm\rh1,%ymm13,%ymm13
vpaddq %ymm\rh2,%ymm14,%ymm14
vpaddq %ymm\rh3,%ymm15,%ymm15
vpsrlq $32,%ymm12,%ymm12
vpsrlq $32,%ymm13,%ymm13
vpsrlq $32,%ymm14,%ymm14
vpsrlq $32,%ymm15,%ymm15

#update
# rh = rl + ymm2 - t (ymm2 keeps the difference non-negative), rl = rl + t
vpaddd %ymm2,%ymm\rl0,%ymm\rh0
vpaddd %ymm2,%ymm\rl1,%ymm\rh1
vpaddd %ymm2,%ymm\rl2,%ymm\rh2
vpaddd %ymm2,%ymm\rl3,%ymm\rh3
vpaddd %ymm12,%ymm\rl0,%ymm\rl0
vpaddd %ymm13,%ymm\rl1,%ymm\rl1
vpaddd %ymm14,%ymm\rl2,%ymm\rl2
vpaddd %ymm15,%ymm\rl3,%ymm\rl3
vpsubd %ymm12,%ymm\rh0,%ymm\rh0
vpsubd %ymm13,%ymm\rh1,%ymm\rh1
vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm

# Forward NTT, levels 0 to 2, on one column of the coefficient array.
# Register use matches the prototype in ntt.h under the System V AMD64
# calling convention (assumed): rdi = tmp (output), rsi = a (input),
# rdx = zetas.
# Reads four 32-bit coefficients (zero-extended to 64 bits) from each of eight
# locations 128 bytes apart; writes 8 ymm to rdi at a stride of 256 bytes.
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx
PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2

level0:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# one twiddle for all four butterflies (ymm3 is the macro default)
vpbroadcastd (%rdx),%ymm3

#load
vpmovzxdq (%rsi),%ymm4
vpmovzxdq 128(%rsi),%ymm5
vpmovzxdq 256(%rsi),%ymm6
vpmovzxdq 384(%rsi),%ymm7
vpmovzxdq 512(%rsi),%ymm8
vpmovzxdq 640(%rsi),%ymm9
vpmovzxdq 768(%rsi),%ymm10
vpmovzxdq 896(%rsi),%ymm11

butterfly 4,5,6,7,8,9,10,11

level1:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# two twiddles, each used by two butterflies
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13

butterfly 4,5,8,9,6,7,10,11 12,12,13,13

level2:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# four twiddles, one per butterfly
vpbroadcastd 12(%rdx),%ymm12
vpbroadcastd 16(%rdx),%ymm13
vpbroadcastd 20(%rdx),%ymm14
vpbroadcastd 24(%rdx),%ymm15

butterfly 4,6,8,10,5,7,9,11 12,13,14,15

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,256(%rdi)
vmovdqa %ymm6,512(%rdi)
vmovdqa %ymm7,768(%rdi)
vmovdqa %ymm8,1024(%rdi)
vmovdqa %ymm9,1280(%rdi)
vmovdqa %ymm10,1536(%rdi)
vmovdqa %ymm11,1792(%rdi)

ret

# Forward NTT, remaining levels (labels level3 to level7 below), on one group
# of 32 coefficients.
# Register use matches the prototype in ntt.h under the System V AMD64
# calling convention (assumed): rdi = a (output), rsi = tmp (input),
# rdx = zetas.
# Reads 8 consecutive ymm (256 bytes, one coefficient per 64-bit slot) from
# rsi; writes 4 ymm (128 bytes of packed 32-bit coefficients) to rdi.
.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx
PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 32(%rsi),%ymm5
vmovdqa 64(%rsi),%ymm6
vmovdqa 96(%rsi),%ymm7
vmovdqa 128(%rsi),%ymm8
vmovdqa 160(%rsi),%ymm9
vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11

level3:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpbroadcastd (%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11

level4:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# two twiddles: one in the low 128 bits, the other in the high 128 bits
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13
vpblendd $0xF0,%ymm13,%ymm12,%ymm12

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly 3,8,4,9,5,10,6,11 12,12,12,12

level5:
#PQCLEAN_DILITHIUM2_AVX2_zetas
# four twiddles, one per 64-bit lane
vpmovzxdq 12(%rdx),%ymm12

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly 7,5,3,10,8,6,4,11 12,12,12,12

level6:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13

butterfly 7,5,8,6,3,10,4,11 12,12,13,13

level7:
#PQCLEAN_DILITHIUM2_AVX2_zetas
vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14
vpmovzxdq 108(%rdx),%ymm15

butterfly 7,3,8,4,5,10,6,11 12,13,14,15

#store
# pack two result registers into one: shift one into the high dword of each
# qword lane and blend it (mask 0xAA = odd dwords) with the other
vpsllq $32,%ymm5,%ymm5
vpsllq $32,%ymm10,%ymm10
vpsllq $32,%ymm6,%ymm6
vpsllq $32,%ymm11,%ymm11
vpblendd $0xAA,%ymm5,%ymm7,%ymm7
vpblendd $0xAA,%ymm10,%ymm3,%ymm3
vpblendd $0xAA,%ymm6,%ymm8,%ymm8
vpblendd $0xAA,%ymm11,%ymm4,%ymm4

shuffle4 7,3,5,3
shuffle4 8,4,7,4

shuffle8 5,7,6,7
shuffle8 3,4,5,4

vmovdqa %ymm6,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm4,96(%rdi)

ret

+ 80
- 0
crypto_sign/dilithium2/avx2/nttconsts.c View File

@@ -0,0 +1,80 @@
#include "nttconsts.h"

/* File-local helper constants; all three are #undef'd after the definitions below. */
#define QINV 4236238847 // -q^(-1) mod 2^32
/* NOTE(review): presumably 2^32 mod Q (the Montgomery constant); Q comes from params.h -- confirm. */
#define MONT 4193792ULL
/* Scaling constant stored in _8xdiv, which invntt.s multiplies in after the last inverse-NTT level. */
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


/*
 * 256-bit constants, each holding eight 32-bit words.  They are loaded with
 * aligned vector moves by the assembly in ntt.s / invntt.s.
 */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
/* vpermd index vector: selects dwords 0, 2, 4, 6 into the low four positions. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
/* 0x7FFFFF has the low 23 bits set. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT
#undef DIV


/*
 * Precomputed constant table of N 32-bit words for the forward NTT.
 * NOTE(review): presumably the twiddle factors passed as the `zetas` argument
 * of the ntt_levels* assembly routines -- confirm against the callers.
 */
const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas = {
.as_arr = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776,
3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667,
5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191,
3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439,
7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422,
1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579,
8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390,
7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620,
5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868,
3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076,
6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435,
5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599,
3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165,
7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031,
6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064,
2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112,
162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237,
8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977,
1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735,
6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892,
5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443,
7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090,
4834730, 7018208, 1976782
}
};

/*
 * Precomputed constant table of N 32-bit words for the inverse NTT.
 * NOTE(review): presumably the twiddle factors passed as the `zetas_inv`
 * argument of the invntt_levels* assembly routines -- confirm against the callers.
 */
const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv = {
.as_arr = {
6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985,
4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748,
2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646,
1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087,
177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422,
6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573,
5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061,
6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386,
1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252,
1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818,
1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496,
7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525,
6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443,
7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093,
5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647,
4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669,
43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330,
1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900,
6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326,
4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745,
3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293,
518909, 2608894, 3975713
}
};

+ 27
- 0
crypto_sign/dilithium2/avx2/nttconsts.h View File

@@ -0,0 +1,27 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H
#define PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H

/*
 * Declarations of the constant tables shared between the C sources and the
 * AVX2 assembly routines. Only declarations live here; the definitions are in
 * the matching .c file.
 */

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#include "params.h"

/* Aligned block of eight 32-bit words; poly.c loads it through its .as_vec
 * member with _mm256_load_si256. */
typedef ALIGNED_UINT32(8) aligned_uint32x8_t;

/* Aligned block of N 32-bit words; poly.c indexes it through its .as_arr
 * member. */
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


/*
 * Eight-lane constants. pointwise.S loads 8xqinv and 8xq into ymm registers
 * for its Montgomery reduction, and poly_sub adds 8x2q before subtracting.
 * The remaining ones are not used in the code visible here; presumably each
 * holds the value its name suggests in all eight lanes -- TODO confirm
 * against the definitions.
 *
 * NOTE(review): identifiers that start with an underscore followed by an
 * uppercase letter are reserved by the C standard. The assembly refers to
 * these symbols by exactly these names, so any rename has to be done on both
 * sides at once.
 */
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv;

/* Constant tables handed to the forward and inverse NTT routines (see
 * poly_ntt and poly_invntt_montgomery). */
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv;

#endif //PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H


+ 305
- 0
crypto_sign/dilithium2/avx2/packing.c View File

@@ -0,0 +1,305 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_pk
*
* Description: Serialize the public key pk = (rho, t1): SEEDBYTES bytes of
*              rho followed by K bit-packed t1 polynomials.
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(
    uint8_t *pk,
    const uint8_t *rho,
    const polyveck *t1) {
    uint8_t *out = pk;
    unsigned int idx;

    /* rho occupies the first SEEDBYTES bytes. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }

    /* The packed t1 polynomials follow back to back. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(out, &t1->vec[idx]);
        out += POLT1_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_pk
*
* Description: Deserialize the public key pk = (rho, t1); the inverse of
*              PQCLEAN_DILITHIUM2_AVX2_pack_pk.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(
    uint8_t *rho,
    polyveck *t1,
    const uint8_t *pk) {
    const uint8_t *in = pk;
    unsigned int idx;

    /* rho is stored in the first SEEDBYTES bytes. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }

    /* The packed t1 polynomials follow back to back. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[idx], in);
        in += POLT1_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_sk
*
* Description: Serialize the secret key sk = (rho, key, tr, s1, s2, t0).
*              The fields are written in exactly that order with no padding.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho (SEEDBYTES)
*              - const uint8_t key[]: byte array containing key (SEEDBYTES)
*              - const uint8_t tr[]: byte array containing tr (CRHBYTES)
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
*              - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(
    uint8_t *sk,
    const uint8_t *rho,
    const uint8_t *key,
    const uint8_t *tr,
    const polyvecl *s1,
    const polyveck *s2,
    const polyveck *t0) {
    uint8_t *out = sk;
    unsigned int idx;

    /* Raw byte fields: rho, key, tr. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = key[idx];
    }
    for (idx = 0; idx < CRHBYTES; idx++) {
        *out++ = tr[idx];
    }

    /* s1 (L polynomials), then s2 (K polynomials), both eta-packed. */
    for (idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(out, &s1->vec[idx]);
        out += POLETA_SIZE_PACKED;
    }
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(out, &s2->vec[idx]);
        out += POLETA_SIZE_PACKED;
    }

    /* t0 (K polynomials) closes the key. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(out, &t0->vec[idx]);
        out += POLT0_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_sk
*
* Description: Deserialize the secret key sk = (rho, key, tr, s1, s2, t0);
*              the inverse of PQCLEAN_DILITHIUM2_AVX2_pack_sk.
*
* Arguments:   - uint8_t rho[]: output byte array for rho (SEEDBYTES)
*              - uint8_t key[]: output byte array for key (SEEDBYTES)
*              - uint8_t tr[]: output byte array for tr (CRHBYTES)
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - polyveck *t0: pointer to output vector t0
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(
    uint8_t *rho,
    uint8_t *key,
    uint8_t *tr,
    polyvecl *s1,
    polyveck *s2,
    polyveck *t0,
    const uint8_t *sk) {
    const uint8_t *in = sk;
    unsigned int idx;

    /* Raw byte fields: rho, key, tr. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }
    for (idx = 0; idx < SEEDBYTES; idx++) {
        key[idx] = *in++;
    }
    for (idx = 0; idx < CRHBYTES; idx++) {
        tr[idx] = *in++;
    }

    /* s1 (L polynomials), then s2 (K polynomials), both eta-packed. */
    for (idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[idx], in);
        in += POLETA_SIZE_PACKED;
    }
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[idx], in);
        in += POLETA_SIZE_PACKED;
    }

    /* t0 (K polynomials) closes the key. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[idx], in);
        in += POLT0_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_pack_sig
*
* Description: Serialize the signature sig = (z, h, c).
*              Layout: L packed z polynomials, OMEGA + K hint bytes,
*              N/8 bytes marking the nonzero coefficients of c, and 8 bytes
*              holding the sign bits of those coefficients.
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
*              - const poly *c: pointer to challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(
    uint8_t *sig,
    const polyvecl *z,
    const polyveck *h,
    const poly *c) {
    unsigned int row, col, used;
    uint64_t signs, next_bit;
    uint8_t *hints;
    uint8_t *chal;

    for (row = 0; row < L; row++) {
        PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + row * POLZ_SIZE_PACKED, &z->vec[row]);
    }
    hints = sig + L * POLZ_SIZE_PACKED;
    chal = hints + (OMEGA + K);

    /* Hint encoding: the positions of the nonzero coefficients of every
     * polynomial are appended to one shared list; byte OMEGA + row records
     * where the list stands after polynomial `row`. */
    used = 0;
    for (row = 0; row < K; row++) {
        for (col = 0; col < N; col++) {
            if (h->vec[row].coeffs[col] == 0) {
                continue;
            }
            hints[used++] = (uint8_t)col;
        }
        hints[OMEGA + row] = (uint8_t)used;
    }
    /* Unused position slots are zero-filled. */
    for (; used < OMEGA; used++) {
        hints[used] = 0;
    }

    /* Challenge encoding: one presence bit per coefficient, plus one sign
     * bit per nonzero coefficient (set when the coefficient equals Q - 1). */
    signs = 0;
    next_bit = 1;
    for (row = 0; row < N / 8; row++) {
        chal[row] = 0;
        for (col = 0; col < 8; col++) {
            const unsigned int pos = 8 * row + col;

            if (c->coeffs[pos] == 0) {
                continue;
            }
            chal[row] |= (uint8_t)(1u << col);
            if (c->coeffs[pos] == (Q - 1)) {
                signs |= next_bit;
            }
            next_bit <<= 1;
        }
    }

    /* Sign word, least significant byte first. */
    for (row = 0; row < 8; row++) {
        chal[N / 8 + row] = (uint8_t)(signs >> 8u * row);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_unpack_sig
*
* Description: Unpack signature sig = (z, h, c) and reject encodings that
*              are not canonical. The outputs may have been partially
*              written when 1 is returned.
*
* Arguments:   - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - poly *c: pointer to output challenge polynomial
*              - const uint8_t sig[]: byte array containing
*                                     bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
    polyvecl *z,
    polyveck *h,
    poly *c,
    const uint8_t *sig) {
    unsigned int i, j, k;
    uint64_t signs;

    for (i = 0; i < L; ++i) {
        PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED);
    }
    sig += L * POLZ_SIZE_PACKED;

    /* Decode h */
    k = 0;
    for (i = 0; i < K; ++i) {
        for (j = 0; j < N; ++j) {
            h->vec[i].coeffs[j] = 0;
        }

        /* The running position count must never decrease nor exceed OMEGA. */
        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
            return 1;
        }

        for (j = k; j < sig[OMEGA + i]; ++j) {
            /* Coefficients are ordered for strong unforgeability */
            if (j > k && sig[j] <= sig[j - 1]) {
                return 1;
            }
            h->vec[i].coeffs[sig[j]] = 1;
        }

        k = sig[OMEGA + i];
    }

    /* Extra indices are zero for strong unforgeability */
    for (j = k; j < OMEGA; ++j) {
        if (sig[j]) {
            return 1;
        }
    }

    sig += OMEGA + K;

    /* Decode c */
    for (i = 0; i < N; ++i) {
        c->coeffs[i] = 0;
    }

    /* The sign word is stored least significant byte first, after the
     * N/8 presence bytes. */
    signs = 0;
    for (i = 0; i < 8; ++i) {
        signs |= (uint64_t)sig[N / 8 + i] << 8 * i;
    }

    /* Extra sign bits are zero for strong unforgeability */
    if (signs >> 60) {
        return 1;
    }

    for (i = 0; i < N / 8; ++i) {
        for (j = 0; j < 8; ++j) {
            if ((sig[i] >> j) & 0x01) {
                /* All-ones when the next sign bit is set, zero otherwise.
                 * The bit is masked out before narrowing and the mask is
                 * built in unsigned arithmetic, so no out-of-range value is
                 * ever converted to a signed type. */
                const uint32_t negate = 0u - (uint32_t)(signs & 1);

                /* Branch-free select: 1 stays 1, or is flipped to Q - 1. */
                c->coeffs[8 * i + j] = 1;
                c->coeffs[8 * i + j] ^= negate & (1 ^ (Q - 1));
                signs >>= 1;
            }
        }
    }

    return 0;
}

+ 36
- 0
crypto_sign/dilithium2/avx2/packing.h View File

@@ -0,0 +1,36 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H

/*
 * Serialization of public keys, secret keys and signatures to and from their
 * byte-array encodings. The implementations are in packing.c.
 */

#include "params.h"
#include "polyvec.h"

/* Write pk = (rho, t1): SEEDBYTES bytes of rho, then K packed t1 polynomials. */
void PQCLEAN_DILITHIUM2_AVX2_pack_pk(
uint8_t *pk,
const uint8_t *rho, const polyveck *t1);
/* Write sk = (rho, key, tr, s1, s2, t0), fields in that order. */
void PQCLEAN_DILITHIUM2_AVX2_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
/* Write sig = (z, h, c). */
void PQCLEAN_DILITHIUM2_AVX2_pack_sig(
uint8_t *sig,
const polyvecl *z, const polyveck *h, const poly *c);

/* Read rho and t1 back from a packed public key. */
void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(
uint8_t *rho, polyveck *t1,
const uint8_t *pk);
/* Read all six fields back from a packed secret key. */
void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
/* Read z, h and c back from a packed signature.
 * Returns 1 if the encoding is malformed, 0 otherwise. */
int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(
polyvecl *z, polyveck *h, poly *c, const uint8_t *sig);

#endif

+ 33
- 0
crypto_sign/dilithium2/avx2/params.h View File

@@ -0,0 +1,33 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H

/*
 * Compile-time parameters of this Dilithium2 implementation and the encoded
 * sizes derived from them.
 */

/* Byte length of the rho and key fields of the keys. */
#define SEEDBYTES 32
/* Byte length of the tr field of the secret key. */
#define CRHBYTES 48
/* Number of coefficients per polynomial. */
#define N 256
/* Modulus; uniform sampling accepts values strictly below Q. */
#define Q 8380417
/* Bit length of Q. */
#define QBITS 23
/* Exponent used by power2round (c = c1*2^D + c0) and by poly_shiftl. */
#define D 14
/* Evaluates to 523776; masking vectors are sampled from
 * [-(GAMMA1 - 1), GAMMA1 - 1]. */
#define GAMMA1 ((Q - 1)/16)
/* Evaluates to 261888. */
#define GAMMA2 (GAMMA1/2)
/* Evaluates to 523776; the step used by decompose (c = c1*ALPHA + c0). */
#define ALPHA (2*GAMMA2)

/* Number of polynomials in a polyveck. */
#define K 4
/* Number of polynomials in a polyvecl. */
#define L 3
/* Secret coefficients are sampled from [-ETA, ETA]. */
#define ETA 6
/* Bits per coefficient in the packed eta encoding. */
#define SETABITS 4
/* NOTE(review): not used in the code visible here -- presumably a norm
 * bound used during signing; confirm against sign.c. */
#define BETA 325
/* Number of hint-position bytes reserved in a signature. */
#define OMEGA 80


/* Packed polynomial sizes in bytes: 288, 448, 128, 640 and 128. */
#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8)
#define POLT0_SIZE_PACKED ((N*D)/8)
#define POLETA_SIZE_PACKED ((N*SETABITS)/8)
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8)
#define POLW1_SIZE_PACKED ((N*4)/8)

/* Encoded object sizes in bytes: 1184 (public key), 2800 (secret key) and
 * 2044 (signature). The terms mirror the layouts written by packing.c. */
#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLT1_SIZE_PACKED)
#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + (L + K)*POLETA_SIZE_PACKED + CRHBYTES + K*POLT0_SIZE_PACKED)
#define CRYPTO_BYTES (L*POLZ_SIZE_PACKED + (OMEGA + K) + (N/8 + 8))

#endif

+ 189
- 0
crypto_sign/dilithium2/avx2/pointwise.S View File

@@ -0,0 +1,189 @@
#include "params.h"

/*
 * PQCLEAN_DILITHIUM2_AVX2_pointwise_avx
 *
 * Coefficient-wise product of two arrays of 256 32-bit words. Every 64-bit
 * product p is reduced as (p + ((p * qinv) mod 2^32) * q) >> 32, i.e. a
 * Montgomery reduction provided 8xqinv holds -q^-1 mod 2^32 (presumably so;
 * the constant is defined elsewhere -- confirm in nttconsts.c).
 *
 * Pointers: %rdi = output, %rsi and %rdx = inputs. The C caller in poly.c
 * passes (c->coeffs, a->coeffs, b->coeffs); this mapping assumes the
 * System V AMD64 calling convention.
 * Every load and store is vmovdqa, so all three arrays must be 32-byte
 * aligned.
 *
 * The main loop runs 10 times over 96 bytes (24 coefficients); the straight
 * line tail after it handles the last 64 bytes (16 coefficients), for
 * 240 + 16 = 256 coefficients in total.
 *
 * Overwrites %eax, %ymm0-%ymm7 and %ymm10-%ymm15, and advances %rdi, %rsi
 * and %rdx.
 */
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx
PQCLEAN_DILITHIUM2_AVX2_pointwise_avx:
#consts
/* ymm0 = qinv, ymm1 = q */
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1

/* eax = loop counter */
xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
/* vpmuludq only multiplies the even 32-bit lanes, so the odd lanes are
   shifted down into separate registers. */
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vpsrlq $32,%ymm14,%ymm15

#mul
/* 64-bit products: even lanes in ymm2/4/6, odd lanes in ymm3/5/7 */
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5
vpmuludq %ymm6,%ymm14,%ymm6
vpmuludq %ymm7,%ymm15,%ymm7

#reduce
/* t = low32(product) * qinv */
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
/* t = low32(t) * q */
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
/* product += t; the result is the upper 32 bits of each 64-bit sum */
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpaddq %ymm6,%ymm14,%ymm6
vpaddq %ymm7,%ymm15,%ymm7
/* Even-lane results move down to the even 32-bit slots; the odd-lane
   results already sit in the odd slots of ymm3/5/7. */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm6,%ymm6

#store
/* 0xAA: take the odd 32-bit lanes from ymm3/5/7, the even ones from
   ymm2/4/6 */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* Tail: the remaining two vectors (16 coefficients), same steps as above. */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13

#mul
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5

#reduce
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4

#store
/* 0x55 with the source operands swapped selects the same lanes as the 0xAA
   blends in the loop: even lanes from ymm2/4, odd lanes from ymm3/5. */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

/*
 * pointwise off
 *
 * Loads two vectors (16 coefficients) from byte offset `off` of both %rsi
 * and %rdx and leaves their unreduced 64-bit products in ymm6/ymm8 (even
 * lanes) and ymm7/ymm9 (odd lanes). Uses ymm10-ymm13 as scratch.
 */
.macro pointwise off
#load
vmovdqa \off(%rsi),%ymm6
vmovdqa \off+32(%rsi),%ymm8
vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
/* bring the odd 32-bit lanes down so vpmuludq can reach them */
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13

#mul
vpmuludq %ymm6,%ymm10,%ymm6
vpmuludq %ymm7,%ymm11,%ymm7
vpmuludq %ymm8,%ymm12,%ymm8
vpmuludq %ymm9,%ymm13,%ymm9
.endm

/*
 * acc
 *
 * Adds the products left in ymm6-ymm9 by `pointwise` to the 64-bit running
 * sums kept in ymm2-ymm5.
 */
.macro acc
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm7,%ymm3,%ymm3
vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5
.endm

/*
 * PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx
 *
 * Multiply-accumulate over three coefficient arrays: for every index, the
 * products of the words found at byte offsets 0, 1024 and 2048 of %rsi and
 * %rdx are summed as 64-bit values, and the sum goes through a single
 * reduction of the same form as in pointwise_avx. 1024 bytes is one
 * 256-word array, so both inputs are read as three consecutive arrays; the
 * count of three is hard-coded and equals L in params.h.
 *
 * Pointers: %rdi = output array, %rsi and %rdx = inputs (argument order
 * assumes the System V AMD64 calling convention). All accesses are vmovdqa,
 * so 32-byte alignment is required.
 *
 * 16 iterations of 64 bytes cover the 256 output words.
 * Overwrites %eax and %ymm0-%ymm13, and advances %rdi, %rsi and %rdx.
 */
.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx
PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx:
#consts
/* ymm0 = qinv, ymm1 = q */
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1

/* eax = loop counter */
xor %eax,%eax
_looptop2:
pointwise 0

#mov
/* the first product initializes the running sums in ymm2-ymm5 */
vmovdqa %ymm6,%ymm2
vmovdqa %ymm7,%ymm3
vmovdqa %ymm8,%ymm4
vmovdqa %ymm9,%ymm5

pointwise 1024
acc

pointwise 2048
acc



#reduce
/* t = low32(low32(sum) * qinv) * q ; sum += t ; result = upper 32 bits */
vpmuludq %ymm0,%ymm2,%ymm6
vpmuludq %ymm0,%ymm3,%ymm7
vpmuludq %ymm0,%ymm4,%ymm8
vpmuludq %ymm0,%ymm5,%ymm9
vpmuludq %ymm1,%ymm6,%ymm6
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm8,%ymm8
vpmuludq %ymm1,%ymm9,%ymm9
vpaddq %ymm2,%ymm6,%ymm2
vpaddq %ymm3,%ymm7,%ymm3
vpaddq %ymm4,%ymm8,%ymm4
vpaddq %ymm5,%ymm9,%ymm5
/* even-lane results move down; odd-lane results already sit in the odd
   slots of ymm3/ymm5 */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4

#store
/* 0xAA: odd 32-bit lanes from ymm3/5, even ones from ymm2/4 */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4

vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

add $64,%rsi
add $64,%rdx
add $64,%rdi
add $1,%eax
cmp $16,%eax
jb _looptop2

ret

+ 914
- 0
crypto_sign/dilithium2/avx2/poly.c View File

@@ -0,0 +1,914 @@
#include <immintrin.h>
#include <stdint.h>

#include "fips202x4.h"
#include "ntt.h"
#include "nttconsts.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rejsample.h"
#include "rounding.h"
#include "symmetric.h"

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_reduce
*
* Description: In-place reduction of every coefficient to a representative
*              in [0,2*Q[. Thin wrapper around the assembly routine
*              PQCLEAN_DILITHIUM2_AVX2_reduce_avx.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a) {
    /* One call processes the whole coefficient array. */
    PQCLEAN_DILITHIUM2_AVX2_reduce_avx(&a->coeffs[0]);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_csubq
*
* Description: In-place conditional subtraction: Q is subtracted from every
*              coefficient that is bigger than Q. Thin wrapper around the
*              assembly routine PQCLEAN_DILITHIUM2_AVX2_csubq_avx.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a) {
    /* One call processes the whole coefficient array. */
    PQCLEAN_DILITHIUM2_AVX2_csubq_avx(&a->coeffs[0]);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_freeze
*
* Description: Bring all coefficients of the polynomial to their standard
*              representatives: a reduction followed by a conditional
*              subtraction of Q.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a) {
    /* Same two assembly calls as before, made through the wrappers above. */
    PQCLEAN_DILITHIUM2_AVX2_poly_reduce(a);
    PQCLEAN_DILITHIUM2_AVX2_poly_csubq(a);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_add
*
* Description: Coefficient-wise addition c = a + b, eight 32-bit lanes at a
*              time. No modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first summand
*              - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i lhs = _mm256_load_si256(&a->coeffs_x8[lane]);
        const __m256i rhs = _mm256_load_si256(&b->coeffs_x8[lane]);

        _mm256_store_si256(&c->coeffs_x8[lane], _mm256_add_epi32(lhs, rhs));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_sub
*
* Description: Coefficient-wise subtraction c = a + 2*Q - b, eight 32-bit
*              lanes at a time. Assumes coefficients of the second input
*              polynomial to be less than 2*Q, so that adding 2*Q first
*              keeps the result non-negative. No modular reduction is
*              performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial to be
*                               subtracted from first input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
    const __m256i offset = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec);
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i minuend = _mm256_load_si256(&a->coeffs_x8[lane]);
        const __m256i subtrahend = _mm256_load_si256(&b->coeffs_x8[lane]);
        const __m256i lifted = _mm256_add_epi32(minuend, offset);

        _mm256_store_si256(&c->coeffs_x8[lane], _mm256_sub_epi32(lifted, subtrahend));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_shiftl
*
* Description: In-place multiplication of every coefficient by 2^D (a left
*              shift by D bits), without modular reduction. Assumes input
*              coefficients to be less than 2^{32-D}.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a) {
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i coeffs = _mm256_load_si256(&a->coeffs_x8[lane]);

        _mm256_store_si256(&a->coeffs_x8[lane], _mm256_slli_epi32(coeffs, D));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_ntt
*
* Description: Forward NTT, in place. Output coefficients can be up to 16*Q
*              larger than input coefficients.
*              Runs in two passes through a scratch buffer: the levels0t2
*              routine writes into the scratch, the levels3t8 routine
*              writes the result back into the polynomial.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a) {
    ALIGNED_UINT64(N) scratch;
    unsigned int col, base, zoff;

    /* First pass: eight calls, each starting four coefficients further in,
     * all with the same zeta pointer. */
    for (col = 0; col < N / 8; col += 4) {
        PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(scratch.as_arr + col, a->coeffs + col, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + 1);
    }

    /* Second pass: eight blocks of 32 coefficients; the zeta pointer moves
     * on by 31 entries per block. */
    for (base = 0, zoff = 8; base < N; base += 32, zoff += 31) {
        PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(a->coeffs + base, scratch.as_arr + base, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + zoff);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery
*
* Description: Inverse NTT and multiplication with 2^{32}, in place. Input
*              coefficients need to be less than 2*Q. Output coefficients
*              are less than 2*Q.
*              Runs in two passes through a scratch buffer: the levels0t4
*              routine writes into the scratch, the levels5t7 routine
*              writes the result back into the polynomial.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a) {
    ALIGNED_UINT64(N) scratch;
    unsigned int col, base, zoff;

    /* First pass: eight blocks of 32 coefficients; the zeta pointer moves
     * on by 31 entries per block. */
    for (base = 0, zoff = 0; base < N; base += 32, zoff += 31) {
        PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(scratch.as_arr + base, a->coeffs + base, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + zoff);
    }

    /* Second pass: eight calls, each starting four coefficients further in,
     * all with the same zeta pointer. */
    for (col = 0; col < N / 8; col += 4) {
        PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(a->coeffs + col, scratch.as_arr + col, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + 248);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery
*
* Description: Pointwise multiplication of polynomials in NTT domain
*              representation and multiplication of resulting polynomial
*              with 2^{-32}. Output coefficients are less than 2*Q if input
*              coefficient are less than 22*Q. Thin wrapper around the
*              assembly routine PQCLEAN_DILITHIUM2_AVX2_pointwise_avx.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
    /* One call processes all N coefficients. */
    PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(&c->coeffs[0], &a->coeffs[0], &b->coeffs[0]);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_power2round
*
* Description: For all coefficients c of the input polynomial,
*              compute c0, c1 such that c mod Q = c1*2^D + c0
*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients Q + c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *restrict a1,
        poly *restrict a0,
        const poly *restrict a) {
    unsigned int k;

    /* The scalar helper returns the high part and stores the low part
     * through its second argument. */
    for (k = 0; k < N; k++) {
        a1->coeffs[k] = PQCLEAN_DILITHIUM2_AVX2_power2round(a->coeffs[k], &a0->coeffs[k]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_decompose
*
* Description: For all coefficients c of the input polynomial,
*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients Q + c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(
    poly *restrict a1,
    poly *restrict a0,
    const poly *restrict a) {
    unsigned int k = 0;

    /* The scalar helper returns the high part and stores the low part
     * through its second argument. */
    while (k < N) {
        a1->coeffs[k] = PQCLEAN_DILITHIUM2_AVX2_decompose(a->coeffs[k], &a0->coeffs[k]);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_make_hint
*
* Description: Compute hint polynomial. The coefficients of which indicate
*              whether the low bits of the corresponding coefficient of
*              the input polynomial overflow into the high bits.
*
* Arguments:   - poly *h: pointer to output hint polynomial
*              - const poly *a0: pointer to low part of input polynomial
*              - const poly *a1: pointer to high part of input polynomial
*
* Returns the sum of the hint coefficients, i.e. the number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(
    poly *restrict h,
    const poly *restrict a0,
    const poly *restrict a1) {
    unsigned int ones = 0;
    unsigned int k;

    for (k = 0; k < N; k++) {
        h->coeffs[k] = PQCLEAN_DILITHIUM2_AVX2_make_hint(a0->coeffs[k], a1->coeffs[k]);
        /* Count from the stored value, exactly as written to h. */
        ones += h->coeffs[k];
    }

    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_use_hint
*
* Description: Use hint polynomial to correct the high bits of a polynomial,
*              one coefficient at a time.
*
* Arguments:   - poly *a: pointer to output polynomial with corrected high bits
*              - const poly *b: pointer to input polynomial
*              - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(
    poly *restrict a,
    const poly *restrict b,
    const poly *restrict h) {
    unsigned int k = 0;

    while (k < N) {
        a->coeffs[k] = PQCLEAN_DILITHIUM2_AVX2_use_hint(b->coeffs[k], h->coeffs[k]);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_chknorm
*
* Description: Check infinity norm of polynomial against given bound.
*              Assumes input coefficients to be standard representatives.
*
* Arguments:   - const poly *a: pointer to polynomial
*              - uint32_t B: norm bound
*
* Returns 0 if norm is strictly smaller than B and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B) {
    unsigned int k;

    /* Returning at the first violation is fine: which coefficient exceeds
       the bound is independent of secret data. The sign of the centralized
       representative must not leak, which is why the absolute value below
       is computed without a branch. */
    for (k = 0; k < N; ++k) {
        /* Distance from the midpoint; negative for coefficients above it. */
        int32_t centered = (Q - 1) / 2 - a->coeffs[k];

        /* The shift yields all-ones for negative values (this relies on an
           arithmetic right shift), so the XOR maps negative v to -v - 1. */
        centered ^= (centered >> 31);
        /* Gives c for c <= (Q-1)/2 and Q - c otherwise. */
        centered = (Q - 1) / 2 - centered;

        if ((uint32_t)centered >= B) {
            return 1;
        }
    }

    return 0;
}

/*************************************************
* Name:        rej_uniform_ref
*
* Description: Sample uniformly random coefficients in [0, Q-1] by
*              performing rejection sampling using array of random bytes.
*              Each candidate is built from three bytes (little endian) with
*              the top bit cleared; candidates >= Q are discarded.
*
* Arguments:   - uint32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const unsigned char *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_uniform_ref(uint32_t *a,
                                    unsigned int len,
                                    const unsigned char *buf,
                                    unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset = 0;

    while (accepted < len && offset + 3 <= buflen) {
        uint32_t cand = (uint32_t)buf[offset]
                        | ((uint32_t)buf[offset + 1] << 8)
                        | ((uint32_t)buf[offset + 2] << 16);

        offset += 3;
        /* Keep the low 23 bits only. */
        cand &= 0x7FFFFF;

        if (cand < Q) {
            a[accepted++] = cand;
        }
    }

    return accepted;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_uniform
*
* Description: Sample polynomial with uniformly random coefficients
*              in [0,Q-1] by performing rejection sampling using the
*              output of the stream128 generator initialized with
*              (seed, nonce).
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce) {
    /* Two spare bytes: up to two leftover bytes are moved to the front
     * before a further block is appended. */
    unsigned char buf[POLY_UNIFORM_BUFLEN + 2];
    stream128_state state;
    unsigned int avail = POLY_UNIFORM_BUFLEN;
    unsigned int filled, carry, k;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);

    /* Bulk of the work goes through the vectorized sampler. */
    filled = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a->coeffs, N, buf, avail);

    /* Top up one block at a time with the scalar sampler. */
    while (filled < N) {
        /* Bytes that did not form a complete 3-byte candidate. */
        carry = avail % 3;
        for (k = 0; k < carry; ++k) {
            buf[k] = buf[avail - carry + k];
        }

        stream128_squeezeblocks(buf + carry, 1, &state);
        avail = STREAM128_BLOCKBYTES + carry;
        filled += rej_uniform_ref(a->coeffs + filled, N - filled, buf, avail);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x
*
* Description: Sample four polynomials with uniformly random coefficients
*              in [0,Q-1] at once. Each polynomial is drawn from its own
*              four-way SHAKE128 stream, absorbed from seed followed by the
*              two nonce bytes (low byte first).
*
* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to output polynomials
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce0..nonce3: one 2-byte nonce per polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    poly *const out[4] = {a0, a1, a2, a3};
    const uint16_t nonces[4] = {nonce0, nonce1, nonce2, nonce3};
    unsigned char inbuf[4][SEEDBYTES + 2];
    unsigned char outbuf[4][5 * SHAKE128_RATE];
    __m256i state[25];
    unsigned int ctr[4];
    unsigned int lane, k;

    /* Per-lane input: seed || nonce (little endian). */
    for (lane = 0; lane < 4; ++lane) {
        for (k = 0; k < SEEDBYTES; ++k) {
            inbuf[lane][k] = seed[k];
        }
        inbuf[lane][SEEDBYTES + 0] = (unsigned char)nonces[lane];
        inbuf[lane][SEEDBYTES + 1] = (unsigned char)(nonces[lane] >> 8);
    }

    PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            SEEDBYTES + 2);
    PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
            state);

    /* Initial five blocks go through the vectorized sampler. */
    for (lane = 0; lane < 4; ++lane) {
        ctr[lane] = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(out[lane]->coeffs, N, outbuf[lane], 5 * SHAKE128_RATE);
    }

    /* All four streams advance together until every polynomial is full;
     * lanes that are already complete ask for zero further coefficients. */
    while (ctr[0] < N || ctr[1] < N || ctr[2] < N || ctr[3] < N) {
        PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
                state);

        for (lane = 0; lane < 4; ++lane) {
            ctr[lane] += rej_uniform_ref(out[lane]->coeffs + ctr[lane], N - ctr[lane], outbuf[lane],
                                         SHAKE128_RATE);
        }
    }
}

/*************************************************
* Name:        rej_eta_ref
*
* Description: Sample uniformly random coefficients in [-ETA, ETA] by
*              performing rejection sampling using array of random bytes.
*              Every byte yields two 4-bit candidates (low nibble first);
*              a candidate t <= 2*ETA is stored as Q + ETA - t.
*
* Arguments:   - uint32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const unsigned char *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_eta_ref(uint32_t *a,
                                unsigned int len,
                                const unsigned char *buf,
                                unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset = 0;

    while (accepted < len && offset < buflen) {
        const uint32_t lo = buf[offset] & 0x0F;
        const uint32_t hi = buf[offset] >> 4;

        offset++;

        if (lo <= 2 * ETA) {
            a[accepted++] = Q + ETA - lo;
        }
        /* The first nibble may just have filled the last free slot. */
        if (hi <= 2 * ETA && accepted < len) {
            a[accepted++] = Q + ETA - hi;
        }
    }

    return accepted;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta
*
* Description: Sample polynomial with uniformly random coefficients
*              in [-ETA,ETA] by performing rejection sampling using the
*              output of the stream128 generator initialized with
*              (seed, nonce).
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce) {
    unsigned char buf[POLY_UNIFORM_ETA_BUFLEN];
    stream128_state state;
    unsigned int filled;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);

    /* Bulk of the work goes through the vectorized sampler. */
    filled = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN);

    /* Top up one block at a time; every byte is consumed whole, so nothing
     * has to be carried over between blocks. */
    for (; filled < N; filled += rej_eta_ref(a->coeffs + filled, N - filled, buf, STREAM128_BLOCKBYTES)) {
        stream128_squeezeblocks(buf, 1, &state);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x
*
* Description: Sample four polynomials with uniformly random coefficients
*              in [-ETA,ETA] at once. Each polynomial is drawn from its own
*              four-way SHAKE128 stream, absorbed from seed followed by the
*              two nonce bytes (low byte first).
*
* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to output polynomials
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce0..nonce3: one 2-byte nonce per polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    poly *const out[4] = {a0, a1, a2, a3};
    const uint16_t nonces[4] = {nonce0, nonce1, nonce2, nonce3};
    unsigned char inbuf[4][SEEDBYTES + 2];
    unsigned char outbuf[4][2 * SHAKE128_RATE];
    __m256i state[25];
    unsigned int ctr[4];
    unsigned int lane, k;

    /* Per-lane input: seed || nonce (little endian). */
    for (lane = 0; lane < 4; ++lane) {
        for (k = 0; k < SEEDBYTES; ++k) {
            inbuf[lane][k] = seed[k];
        }
        inbuf[lane][SEEDBYTES + 0] = (unsigned char)nonces[lane];
        inbuf[lane][SEEDBYTES + 1] = (unsigned char)(nonces[lane] >> 8);
    }

    PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            SEEDBYTES + 2);
    PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2,
            state);

    /* Initial two blocks go through the vectorized sampler. */
    for (lane = 0; lane < 4; ++lane) {
        ctr[lane] = PQCLEAN_DILITHIUM2_AVX2_rej_eta(out[lane]->coeffs, N, outbuf[lane], 2 * SHAKE128_RATE);
    }

    /* All four streams advance together until every polynomial is full;
     * lanes that are already complete ask for zero further coefficients. */
    while (ctr[0] < N || ctr[1] < N || ctr[2] < N || ctr[3] < N) {
        PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
                state);

        for (lane = 0; lane < 4; ++lane) {
            ctr[lane] += rej_eta_ref(out[lane]->coeffs + ctr[lane], N - ctr[lane], outbuf[lane], SHAKE128_RATE);
        }
    }
}

/*************************************************
* Name: rej_gamma1m1_ref
*
* Description: Sample uniformly random coefficients
* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling
* using array of random bytes.
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_gamma1m1_ref(uint32_t *a,
                                     unsigned int len,
                                     const unsigned char *buf,
                                     unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int offset;

    /* Each complete 5-byte group holds two 20-bit little-endian candidates;
     * an incomplete trailing group is never read. */
    for (offset = 0; accepted < len && offset + 5 <= buflen; offset += 5) {
        const unsigned char *p = buf + offset;
        /* Low candidate: bytes 0, 1 and the low nibble of byte 2. */
        const uint32_t lo = ((uint32_t)p[0]
                             | ((uint32_t)p[1] << 8)
                             | ((uint32_t)p[2] << 16)) & 0xFFFFF;
        /* High candidate: high nibble of byte 2 and bytes 3, 4. */
        const uint32_t hi = (uint32_t)(p[2] >> 4)
                            | ((uint32_t)p[3] << 4)
                            | ((uint32_t)p[4] << 12);

        /* Accepted candidates t are stored as Q + GAMMA1 - 1 - t. */
        if (lo <= 2 * GAMMA1 - 2) {
            a[accepted++] = Q + GAMMA1 - 1 - lo;
        }
        /* The second candidate is dropped if the first one filled the output. */
        if (hi <= 2 * GAMMA1 - 2 && accepted < len) {
            a[accepted++] = Q + GAMMA1 - 1 - hi;
        }
    }

    return accepted;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1
*
* Description: Sample polynomial with uniformly random coefficients
* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection
* sampling on output stream of SHAKE256(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* CRHBYTES
* - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES)
#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a,
        const unsigned char seed[CRHBYTES],
        uint16_t nonce) {
    /* NOTE(review): the 4 bytes past BUFLEN are never written in this function;
     * presumably slack for the vectorised sampler - confirm against rejsample.c. */
    unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
    unsigned int filled = POLY_UNIFORM_GAMMA1M1_BUFLEN;
    unsigned int have;
    stream256_state state;

    /* Initial fill: squeeze NBLOCKS blocks and run the AVX2 sampler over them. */
    stream256_init(&state, seed, nonce);
    stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state);
    have = PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN);

    /* Refill one block at a time until all N coefficients are present. */
    while (have < N) {
        /* Bytes of the last incomplete 5-byte group move to the front so that
         * the group is completed by the freshly squeezed block. */
        const unsigned int tail = filled % 5;
        unsigned int k = 0;

        while (k < tail) {
            buf[k] = buf[filled - tail + k];
            ++k;
        }

        stream256_squeezeblocks(buf + tail, 1, &state);
        filled = STREAM256_BLOCKBYTES + tail;
        have += rej_gamma1m1_ref(a->coeffs + have, N - have, buf, filled);
    }
}

void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[CRHBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    /* Per-lane views of the arguments so the four lanes can share loops. */
    poly *const out[4] = {a0, a1, a2, a3};
    const uint16_t nonces[4] = {nonce0, nonce1, nonce2, nonce3};
    unsigned int have[4];
    unsigned int lane, k;
    unsigned char inbuf[4][CRHBYTES + 2];
    unsigned char outbuf[4][5 * SHAKE256_RATE];
    __m256i state[25];

    /* Every lane absorbs seed || nonce, nonce encoded low byte first. */
    for (lane = 0; lane < 4; ++lane) {
        for (k = 0; k < CRHBYTES; ++k) {
            inbuf[lane][k] = seed[k];
        }
        inbuf[lane][CRHBYTES + 0] = nonces[lane] & 0xFF;
        inbuf[lane][CRHBYTES + 1] = nonces[lane] >> 8;
    }

    PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            CRHBYTES + 2);
    PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
            state);

    /* First pass over five blocks per lane; 5 * SHAKE256_RATE is a multiple
     * of 5, so no partial 5-byte group is left over here. */
    for (lane = 0; lane < 4; ++lane) {
        have[lane] = rej_gamma1m1_ref(out[lane]->coeffs, N, outbuf[lane], 5 * SHAKE256_RATE);
    }

    /* All lanes are squeezed together until the slowest one is complete; a
     * finished lane is sampled with a remaining length of zero.
     * NOTE(review): rej_gamma1m1_ref only consumes whole 5-byte groups, so if
     * SHAKE256_RATE is not a multiple of 5 the trailing bytes of each refill
     * block are dropped here, whereas the single-lane variant carries them
     * over into the next block - confirm this difference is intended. */
    while (have[0] < N || have[1] < N || have[2] < N || have[3] < N) {
        PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
                state);

        for (lane = 0; lane < 4; ++lane) {
            have[lane] += rej_gamma1m1_ref(out[lane]->coeffs + have[lane], N - have[lane],
                                           outbuf[lane], SHAKE256_RATE);
        }
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_pack
*
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
* Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* POLETA_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int k;

    /* Two coefficients share one output byte, four bits each. */
    for (k = 0; k < N / 2; ++k) {
        /* Q + ETA - c, truncated to one byte. */
        const unsigned char lo = Q + ETA - a->coeffs[2 * k + 0];
        const unsigned char hi = Q + ETA - a->coeffs[2 * k + 1];

        r[k] = lo | (hi << 4);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack
*
* Description: Unpack polynomial with coefficients in [-ETA,ETA].
* Output coefficients lie in [Q-ETA,Q+ETA].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int k;

    /* Every input byte yields two coefficients: low nibble first, then high. */
    for (k = 0; k < N / 2; ++k) {
        const uint32_t lo = a[k] & 0x0F;
        const uint32_t hi = a[k] >> 4;

        /* Undo the packing map: stored value t corresponds to Q + ETA - t. */
        r->coeffs[2 * k + 0] = Q + ETA - lo;
        r->coeffs[2 * k + 1] = Q + ETA - hi;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* POLT1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int g;

    /* Eight 9-bit coefficients are laid out back to back in nine bytes. */
    for (g = 0; g < N / 8; ++g) {
        const uint32_t *c = a->coeffs + 8 * g;
        unsigned char *out = r + 9 * g;

        out[0] = (uint8_t)((c[0] >> 0));
        out[1] = (uint8_t)((c[0] >> 8) | (c[1] << 1));
        out[2] = (uint8_t)((c[1] >> 7) | (c[2] << 2));
        out[3] = (uint8_t)((c[2] >> 6) | (c[3] << 3));
        out[4] = (uint8_t)((c[3] >> 5) | (c[4] << 4));
        out[5] = (uint8_t)((c[4] >> 4) | (c[5] << 5));
        out[6] = (uint8_t)((c[5] >> 3) | (c[6] << 6));
        out[7] = (uint8_t)((c[6] >> 2) | (c[7] << 7));
        out[8] = (uint8_t)((c[7] >> 1));
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack
*
* Description: Unpack polynomial t1 with 9-bit coefficients.
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int g;

    /* Nine input bytes expand to eight coefficients; each is masked to 9 bits. */
    for (g = 0; g < N / 8; ++g) {
        const unsigned char *in = a + 9 * g;
        uint32_t *c = r->coeffs + 8 * g;

        c[0] = ((in[0] >> 0) | ((uint32_t)in[1] << 8)) & 0x1FF;
        c[1] = ((in[1] >> 1) | ((uint32_t)in[2] << 7)) & 0x1FF;
        c[2] = ((in[2] >> 2) | ((uint32_t)in[3] << 6)) & 0x1FF;
        c[3] = ((in[3] >> 3) | ((uint32_t)in[4] << 5)) & 0x1FF;
        c[4] = ((in[4] >> 4) | ((uint32_t)in[5] << 4)) & 0x1FF;
        c[5] = ((in[5] >> 5) | ((uint32_t)in[6] << 3)) & 0x1FF;
        c[6] = ((in[6] >> 6) | ((uint32_t)in[7] << 2)) & 0x1FF;
        c[7] = ((in[7] >> 7) | ((uint32_t)in[8] << 1)) & 0x1FF;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* POLT0_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int g;

    /* Four coefficients are packed into seven bytes (14 bits each in the
     * layout below). */
    for (g = 0; g < N / 4; ++g) {
        const uint32_t *c = a->coeffs + 4 * g;
        unsigned char *out = r + 7 * g;
        /* Stored value for coefficient x is Q + 2^(D-1) - x. */
        const uint32_t t0 = Q + (1U << (D - 1)) - c[0];
        const uint32_t t1 = Q + (1U << (D - 1)) - c[1];
        const uint32_t t2 = Q + (1U << (D - 1)) - c[2];
        const uint32_t t3 = Q + (1U << (D - 1)) - c[3];

        /* Each assignment keeps only the low eight bits of its expression. */
        out[0] = (unsigned char)t0;
        out[1] = (unsigned char)((t0 >> 8) | (t1 << 6));
        out[2] = (unsigned char)(t1 >> 2);
        out[3] = (unsigned char)((t1 >> 10) | (t2 << 4));
        out[4] = (unsigned char)(t2 >> 4);
        out[5] = (unsigned char)((t2 >> 12) | (t3 << 2));
        out[6] = (unsigned char)(t3 >> 6);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack
*
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int g;

    /* Seven input bytes expand to four coefficients. */
    for (g = 0; g < N / 4; ++g) {
        const unsigned char *in = a + 7 * g;
        uint32_t *c = r->coeffs + 4 * g;

        /* Reassemble the four packed values from their byte fragments. */
        const uint32_t t0 = (uint32_t)in[0]
                            | ((uint32_t)(in[1] & 0x3F) << 8);
        const uint32_t t1 = (uint32_t)(in[1] >> 6)
                            | ((uint32_t)in[2] << 2)
                            | ((uint32_t)(in[3] & 0x0F) << 10);
        const uint32_t t2 = (uint32_t)(in[3] >> 4)
                            | ((uint32_t)in[4] << 4)
                            | ((uint32_t)(in[5] & 0x03) << 12);
        const uint32_t t3 = (uint32_t)(in[5] >> 2)
                            | ((uint32_t)in[6] << 6);

        /* Invert the packing map t = Q + 2^(D-1) - x. */
        c[0] = Q + (1U << (D - 1)) - t0;
        c[1] = Q + (1U << (D - 1)) - t1;
        c[2] = Q + (1U << (D - 1)) - t2;
        c[3] = Q + (1U << (D - 1)) - t3;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyz_pack
*
* Description: Bit-pack polynomial z with coefficients
* in [-(GAMMA1 - 1), GAMMA1 - 1].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* POLZ_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int k;

    /* Two coefficients are packed into five bytes (20 bits each). */
    for (k = 0; k < N / 2; ++k) {
        unsigned char *out = r + 5 * k;
        uint32_t lo, hi;

        /* Map to {0,...,2*GAMMA1 - 2}: compute GAMMA1 - 1 - x and add Q
         * whenever that difference has its sign bit set. */
        lo = GAMMA1 - 1 - a->coeffs[2 * k + 0];
        lo += ((int32_t)lo >> 31) & Q;
        hi = GAMMA1 - 1 - a->coeffs[2 * k + 1];
        hi += ((int32_t)hi >> 31) & Q;

        /* Each assignment keeps only the low eight bits of its expression. */
        out[0] = (unsigned char)lo;
        out[1] = (unsigned char)(lo >> 8);
        out[2] = (unsigned char)((lo >> 16) | (hi << 4));
        out[3] = (unsigned char)(hi >> 4);
        out[4] = (unsigned char)(hi >> 12);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyz_unpack
*
* Description: Unpack polynomial z with coefficients
* in [-(GAMMA1 - 1), GAMMA1 - 1].
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int k;

    /* Five input bytes expand to two coefficients. */
    for (k = 0; k < N / 2; ++k) {
        const unsigned char *in = a + 5 * k;
        uint32_t lo, hi;

        /* Low value: bytes 0, 1 and the low nibble of byte 2. */
        lo = (uint32_t)in[0]
             | ((uint32_t)in[1] << 8)
             | ((uint32_t)(in[2] & 0x0F) << 16);
        /* High value: high nibble of byte 2 and bytes 3, 4. */
        hi = (uint32_t)(in[2] >> 4)
             | ((uint32_t)in[3] << 4)
             | ((uint32_t)in[4] << 12);

        /* Invert the packing map; Q is added when the difference has its
         * sign bit set. */
        lo = GAMMA1 - 1 - lo;
        lo += ((int32_t)lo >> 31) & Q;
        hi = GAMMA1 - 1 - hi;
        hi += ((int32_t)hi >> 31) & Q;

        r->coeffs[2 * k + 0] = lo;
        r->coeffs[2 * k + 1] = hi;
    }

}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyw1_pack
*
* Description: Bit-pack polynomial w1 with coefficients in [0, 15].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* POLW1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int k;

    /* Even-indexed coefficient goes to the low nibble, odd-indexed one to the
     * high nibble; the result is truncated to one byte. */
    for (k = 0; k < N / 2; ++k) {
        const uint32_t *pair = a->coeffs + 2 * k;

        r[k] = pair[0] | (pair[1] << 4);
    }
}

+ 83
- 0
crypto_sign/dilithium2/avx2/poly.h View File

@@ -0,0 +1,83 @@
#ifndef POLY_H
#define POLY_H

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#include "params.h"

/* Polynomial with N coefficients. The union offers two views of the same
 * storage: individual 32-bit words, or N / 8 256-bit vectors of eight words
 * each. The __m256i member also gives the union the alignment of __m256i. */
typedef union {
uint32_t coeffs[N];        /* scalar view: one 32-bit word per coefficient */
__m256i coeffs_x8[N / 8];  /* vector view: eight coefficients per element */
} poly;

void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a);

void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a);

void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b);

void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h);

int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a,
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
poly *a1,
poly *a2,
poly *a3,
const uint8_t *seed,
uint16_t nonce0,
uint16_t nonce1,
uint16_t nonce2,
uint16_t nonce3);

void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t *r, const poly *a);
#endif

+ 353
- 0
crypto_sign/dilithium2/avx2/polyvec.c View File

@@ -0,0 +1,353 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze
*
* Description: Reduce coefficients of polynomials in vector of length L
* to standard representatives.
*
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) {
    /* Apply poly_freeze to each of the L entries, in index order. */
    for (unsigned int k = 0; k != L; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_freeze(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_add
*
* Description: Add vectors of polynomials of length L.
* No modular reduction is performed.
*
* Arguments: - polyvecl *w: pointer to output vector
* - const polyvecl *u: pointer to first summand
* - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    /* Entry-wise sum: w[k] = u[k] + v[k] for each of the L entries. */
    for (unsigned int k = 0; k != L; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_add(w->vec + k, u->vec + k, v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L. Output
* coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments: - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) {
    /* Transform each of the L entries in place, in index order. */
    for (unsigned int k = 0; k != L; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_ntt(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
* resulting vector by 2^{-32} and add (accumulate) polynomials
* in it. Input/output vectors are in NTT domain representation.
* Input coefficients are assumed to be less than 22*Q. Output
* coefficients are less than 2*L*Q.
*
* Arguments: - poly *w: output polynomial
* - const polyvecl *u: pointer to first input vector
* - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v) {
    /* The routine receives the coefficient array of the first polynomial of
     * each vector together with the output coefficients; all of the work is
     * done inside PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx. */
    PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->coeffs, u->vec[0].coeffs, v->vec[0].coeffs);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input coefficients to be standard representatives.
*
* Arguments: - const polyvecl *v: pointer to vector
* - uint32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
    int exceeded = 0;

    /* Scan in index order and stop at the first entry whose check is nonzero. */
    for (unsigned int k = 0; k != L && !exceeded; k++) {
        exceeded = PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(v->vec + k, bound) != 0;
    }

    return exceeded;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/


/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
* to representatives in [0,2*Q[.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) {
    /* Apply poly_reduce to each of the K entries, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_reduce(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq
*
* Description: For all coefficients of polynomials in vector of length K
* subtract Q if coefficient is bigger than Q.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) {
    /* Apply poly_csubq to each of the K entries, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_csubq(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze
*
* Description: Reduce coefficients of polynomials in vector of length K
* to standard representatives.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) {
    /* Apply poly_freeze to each of the K entries, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_freeze(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_add
*
* Description: Add vectors of polynomials of length K.
* No modular reduction is performed.
*
* Arguments: - polyveck *w: pointer to output vector
* - const polyveck *u: pointer to first summand
* - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    /* Entry-wise sum: w[k] = u[k] + v[k] for each of the K entries. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_add(w->vec + k, u->vec + k, v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K.
* Assumes coefficients of polynomials in second input vector
* to be less than 2*Q. No modular reduction is performed.
*
* Arguments: - polyveck *w: pointer to output vector
* - const polyveck *u: pointer to first input vector
* - const polyveck *v: pointer to second input vector to be
* subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    /* Entry-wise difference: w[k] = u[k] - v[k] for each of the K entries. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_sub(w->vec + k, u->vec + k, v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl
*
* Description: Multiply vector of polynomials of Length K by 2^D without modular
* reduction. Assumes input coefficients to be less than 2^{32-D}.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) {
    /* Apply poly_shiftl to each of the K entries, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K. Output
* coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) {
    /* Transform each of the K entries in place, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_ntt(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
* in vector of length K. Input coefficients need to be less
* than 2*Q.
*
* Arguments: - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) {
    /* Apply poly_invntt_montgomery to each of the K entries, in index order. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input coefficients to be standard representatives.
*
* Arguments: - const polyveck *v: pointer to vector
* - uint32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) {
    int exceeded = 0;

    /* Scan in index order and stop at the first entry whose check is nonzero. */
    for (unsigned int k = 0; k != K && !exceeded; k++) {
        exceeded = PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(v->vec + k, bound) != 0;
    }

    return exceeded;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute a0, a1 such that a mod Q = a1*2^D + a0
* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
* standard representatives.
*
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    /* Split each of the K input entries into its (v1, v0) pair. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_power2round(v1->vec + k, v0->vec + k, v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute high and low bits a0, a1 such that a mod Q = a1*ALPHA + a0
* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
* Assumes coefficients to be standard representatives.
*
* Arguments: - polyveck *v1: pointer to output vector of polynomials with
* coefficients a1
* - polyveck *v0: pointer to output vector of polynomials with
* coefficients Q + a0
* - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    /* Decompose each of the K input entries into its (v1, v0) pair. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_decompose(v1->vec + k, v0->vec + k, v->vec + k);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint
*
* Description: Compute hint vector.
*
* Arguments: - polyveck *h: pointer to output vector
* - const polyveck *v0: pointer to low part of input vector
* - const polyveck *v1: pointer to high part of input vector
*
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(polyveck *h,
        const polyveck *v0,
        const polyveck *v1) {
    unsigned int ones = 0;

    /* Build the hint entry by entry and total the per-polynomial counts. */
    for (unsigned int k = 0; k != K; k++) {
        ones += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(h->vec + k, v0->vec + k, v1->vec + k);
    }

    return ones;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*
* Arguments: - polyveck *w: pointer to output vector of polynomials with
* corrected high bits
* - const polyveck *v: pointer to input vector
* - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
    /* Apply the hint entry by entry: w[k] is derived from v[k] and h[k]. */
    for (unsigned int k = 0; k != K; k++) {
        PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(w->vec + k, v->vec + k, h->vec + k);
    }
}

+ 52
- 0
crypto_sign/dilithium2/avx2/polyvec.h View File

@@ -0,0 +1,52 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H

#include <stdint.h>

#include "params.h"
#include "poly.h"

/* Vectors of polynomials of length L */
/* Fixed-size vector of L polynomials, stored contiguously in one array. */
typedef struct {
poly vec[L];  /* the L entries, addressed as vec[0] .. vec[L-1] */
} polyvecl;

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v);
void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B);



/* Vectors of polynomials of length K */
/* Fixed-size vector of K polynomials, stored contiguously in one array. */
typedef struct {
poly vec[K];  /* the K entries, addressed as vec[0] .. vec[K-1] */
} polyveck;

void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v);

int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t B);

void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);

#endif

+ 9
- 0
crypto_sign/dilithium2/avx2/reduce.h View File

@@ -0,0 +1,9 @@
#ifndef REDUCE_H
#define REDUCE_H

#include <stdint.h>

void PQCLEAN_DILITHIUM2_AVX2_reduce_avx(uint32_t a[N]);
void PQCLEAN_DILITHIUM2_AVX2_csubq_avx(uint32_t a[N]);

#endif

+ 91
- 0
crypto_sign/dilithium2/avx2/reduce.s View File

@@ -0,0 +1,91 @@
# In-place partial reduction of an array of 32-bit words.
# The array pointer is taken from %rdi and must be 32-byte aligned because
# every load and store below is vmovdqa. Each loop pass handles four vectors
# of eight words (128 bytes) and the loop runs eight times, so 256 words are
# processed in total. Clobbers %eax, %rdi and %ymm0-%ymm8.
# Per word the result is (a & mask) - (a >> 23) + ((a >> 23) << 13).
.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx
PQCLEAN_DILITHIUM2_AVX2_reduce_avx:
#consts
# mask for the vpand below; presumably eight copies of 2^23 - 1 judging by the
# symbol name (the constant is defined outside this file) - TODO confirm
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0

# loop counter
xor %eax,%eax
_looptop_rdc32:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#reduce
# t = a >> 23 (logical shift), kept in the even-numbered register of each pair
vpsrld $23,%ymm1,%ymm2
vpsrld $23,%ymm3,%ymm4
vpsrld $23,%ymm5,%ymm6
vpsrld $23,%ymm7,%ymm8
# a = a & mask
vpand %ymm0,%ymm1,%ymm1
vpand %ymm0,%ymm3,%ymm3
vpand %ymm0,%ymm5,%ymm5
vpand %ymm0,%ymm7,%ymm7
# a = a - t
vpsubd %ymm2,%ymm1,%ymm1
vpsubd %ymm4,%ymm3,%ymm3
vpsubd %ymm6,%ymm5,%ymm5
vpsubd %ymm8,%ymm7,%ymm7
# t = t << 13
vpslld $13,%ymm2,%ymm2
vpslld $13,%ymm4,%ymm4
vpslld $13,%ymm6,%ymm6
vpslld $13,%ymm8,%ymm8
# a = a + t
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance by the 128 bytes just processed; repeat until eight passes are done
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_rdc32

ret

# In-place conditional subtraction on an array of 32-bit words.
# The array pointer is taken from %rdi and must be 32-byte aligned because
# every load and store below is vmovdqa. Each loop pass handles four vectors
# of eight words (128 bytes) and the loop runs eight times, so 256 words are
# processed in total. Clobbers %eax, %rdi and %ymm0-%ymm8.
# Per word: d = a - q, and q is added back when d is negative as a signed
# 32-bit value.
.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx
PQCLEAN_DILITHIUM2_AVX2_csubq_avx:
#consts
# q in every lane; presumably eight copies of the modulus judging by the
# symbol name (the constant is defined outside this file) - TODO confirm
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0

# loop counter
xor %eax,%eax
_looptop_csubq:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#PQCLEAN_DILITHIUM2_AVX2_csubq
# a = a - q
vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5
vpsubd %ymm0,%ymm7,%ymm7
# arithmetic shift by 31 replicates the sign bit: all ones where a is negative
vpsrad $31,%ymm1,%ymm2
vpsrad $31,%ymm3,%ymm4
vpsrad $31,%ymm5,%ymm6
vpsrad $31,%ymm7,%ymm8
# keep q in the lanes that went negative, zero elsewhere
vpand %ymm0,%ymm2,%ymm2
vpand %ymm0,%ymm4,%ymm4
vpand %ymm0,%ymm6,%ymm6
vpand %ymm0,%ymm8,%ymm8
# add q back in exactly those lanes
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance by the 128 bytes just processed; repeat until eight passes are done
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_csubq

ret

+ 443
- 0
crypto_sign/dilithium2/avx2/rejsample.c View File

@@ -0,0 +1,443 @@
#include <immintrin.h>
#include <stdint.h>

#include "params.h"
#include "rejsample.h"

/*
 * Compaction table indexed by an 8-bit acceptance mask. Row m lists, in
 * ascending order, the positions of the bits that are set in m; the unused
 * trailing entries are zero. The rejection samplers in this file load a row
 * as a permute/shuffle control so that the accepted lanes of a vector are
 * moved to the front before the vector is stored.
 */
static const uint8_t idx[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_rej_uniform
*
* Description: Rejection sampling on a byte stream. Each group of three
*              bytes is read little-endian, masked to 23 bits and kept
*              when the resulting value is smaller than Q. Eight
*              candidates at a time are handled with AVX2 while at least
*              8 output slots and 24 input bytes remain; the rest is
*              handled one candidate at a time.
*
* Arguments:   - uint32_t *r: output array with room for len words
*              - unsigned int len: number of coefficients wanted
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes in buf
*
* Returns the number of coefficients written (at most len). Slots of r at
* or after the returned count, but still inside len, may hold discarded
* candidates because the vector path always stores a full 8-word vector.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint32_t vec[8];
    __m256i d, tmp;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi32(Q);

    ctr = pos = 0;
    while (ctr + 8 <= len && pos + 24 <= buflen) {
        for (i = 0; i < 8; i++) {
            vec[i]  = buf[pos++];
            vec[i] |= (uint32_t)buf[pos++] << 8;
            vec[i] |= (uint32_t)buf[pos++] << 16;
            vec[i] &= 0x7FFFFF;
        }

        d = _mm256_loadu_si256((__m256i_u *)vec);
        /* Lane is all-ones where candidate < Q. */
        tmp = _mm256_cmpgt_epi32(bound, d);
        /* Use the cast intrinsic rather than a compiler-specific vector
         * cast to reinterpret the integer mask as packed floats. */
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));

        /* Move accepted lanes to the front; idx is const, keep it so. */
        __m128i rid = _mm_loadl_epi64((const __m128i_u *)&idx[good]);
        tmp = _mm256_cvtepu8_epi32(rid);
        d = _mm256_permutevar8x32_epi32(d, tmp);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], d);
        ctr += __builtin_popcount(good);
    }

    /* Scalar tail: one 3-byte candidate per iteration. */
    while (ctr < len && pos + 3 <= buflen) {
        vec[0]  = buf[pos++];
        vec[0] |= (uint32_t)buf[pos++] << 8;
        vec[0] |= (uint32_t)buf[pos++] << 16;
        vec[0] &= 0x7FFFFF;

        if (vec[0] < Q) {
            r[ctr++] = vec[0];
        }
    }

    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_rej_eta
*
* Description: Rejection sampling of small coefficients from a byte
*              stream. Every byte yields two 4-bit candidates (low nibble
*              first); a candidate t is kept when t <= 2*ETA and is
*              written as Q + ETA - t. The AVX2 path consumes 16 bytes
*              (32 candidates) per iteration while at least 32 output
*              slots remain; the rest is handled byte by byte.
*
* Arguments:   - uint32_t *r: output array with room for len words
*              - unsigned int len: number of coefficients wanted
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes in buf
*
* Returns the number of coefficients written (at most len). Slots of r at
* or after the returned count, but still inside len, may hold discarded
* values because the vector path always stores full 8-word vectors.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint8_t vec[32];
    __m256i tmp0, tmp1;
    __m128i d0, d1, rid;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi8(2 * ETA + 1);
    const __m256i off = _mm256_set1_epi32(Q + ETA);

    ctr = pos = 0;
    while (ctr + 32 <= len && pos + 16 <= buflen) {
        for (i = 0; i < 16; i++) {
            vec[2 * i + 0] = buf[pos] & 0x0F;
            vec[2 * i + 1] = buf[pos++] >> 4;
        }

        tmp0 = _mm256_loadu_si256((__m256i_u *)vec);
        /* Byte is all-ones where candidate < 2*ETA + 1. */
        tmp1 = _mm256_cmpgt_epi8(bound, tmp0);
        good = _mm256_movemask_epi8(tmp1);

        /* Candidates 0..7: compact, widen to 32 bit, map to Q + ETA - t. */
        d0 = _mm256_castsi256_si128(tmp0);
        rid = _mm_loadl_epi64((const __m128i_u *)&idx[good & 0xFF]);
        d1 = _mm_shuffle_epi8(d0, rid);
        tmp1 = _mm256_cvtepu8_epi32(d1);
        tmp1 = _mm256_sub_epi32(off, tmp1);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
        ctr += __builtin_popcount(good & 0xFF);

        /* Candidates 8..15. */
        d0 = _mm_bsrli_si128(d0, 8);
        rid = _mm_loadl_epi64((const __m128i_u *)&idx[(good >> 8) & 0xFF]);
        d1 = _mm_shuffle_epi8(d0, rid);
        tmp1 = _mm256_cvtepu8_epi32(d1);
        tmp1 = _mm256_sub_epi32(off, tmp1);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
        ctr += __builtin_popcount((good >> 8) & 0xFF);

        /* Candidates 16..23 (upper 128-bit half). */
        d0 = _mm256_extracti128_si256(tmp0, 1);
        rid = _mm_loadl_epi64((const __m128i_u *)&idx[(good >> 16) & 0xFF]);
        d1 = _mm_shuffle_epi8(d0, rid);
        tmp1 = _mm256_cvtepu8_epi32(d1);
        tmp1 = _mm256_sub_epi32(off, tmp1);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
        ctr += __builtin_popcount((good >> 16) & 0xFF);

        /* Candidates 24..31. */
        d0 = _mm_bsrli_si128(d0, 8);
        rid = _mm_loadl_epi64((const __m128i_u *)&idx[(good >> 24) & 0xFF]);
        d1 = _mm_shuffle_epi8(d0, rid);
        tmp1 = _mm256_cvtepu8_epi32(d1);
        tmp1 = _mm256_sub_epi32(off, tmp1);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
        ctr += __builtin_popcount((good >> 24) & 0xFF);
    }

    /* Scalar tail: two candidates per byte; the second one is dropped
     * when the output is already full. */
    while (ctr < len && pos < buflen) {
        vec[0] = buf[pos] & 0x0F;
        vec[1] = buf[pos++] >> 4;

        if (vec[0] <= 2 * ETA) {
            r[ctr++] = Q + ETA - vec[0];
        }
        if (vec[1] <= 2 * ETA && ctr < len) {
            r[ctr++] = Q + ETA - vec[1];
        }
    }

    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1
*
* Description: Rejection sampling from a byte stream. Every 5 bytes hold
*              two 20-bit candidates; a candidate t is kept when
*              t <= 2*GAMMA1 - 2 and is written as Q + GAMMA1 - 1 - t.
*              The AVX2 path consumes 20 bytes (8 candidates) per
*              iteration while at least 8 output slots remain; the rest
*              is handled 5 bytes at a time.
*
* Arguments:   - uint32_t *r: output array with room for len words
*              - unsigned int len: number of coefficients wanted
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes in buf
*
* Returns the number of coefficients written (at most len). Slots of r at
* or after the returned count, but still inside len, may hold discarded
* values because the vector path always stores a full 8-word vector.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint32_t vec[8];
    __m256i d, tmp;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1);
    const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1);

    ctr = pos = 0;
    while (ctr + 8 <= len && pos + 20 <= buflen) {
        for (i = 0; i < 4; i++) {
            /* Low 20 bits of bytes 0..2. */
            vec[2 * i + 0]  = buf[pos + 0];
            vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8;
            vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16;
            vec[2 * i + 0] &= 0xFFFFF;

            /* High nibble of byte 2 followed by bytes 3 and 4. */
            vec[2 * i + 1]  = buf[pos + 2] >> 4;
            vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4;
            vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12;

            pos += 5;
        }

        d = _mm256_loadu_si256((__m256i_u *)vec);
        /* Lane is all-ones where candidate < 2*GAMMA1 - 1. */
        tmp = _mm256_cmpgt_epi32(bound, d);
        /* Use the cast intrinsic rather than a compiler-specific vector
         * cast to reinterpret the integer mask as packed floats. */
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));
        d = _mm256_sub_epi32(off, d);

        /* Move accepted lanes to the front; idx is const, keep it so. */
        __m128i rid = _mm_loadl_epi64((const __m128i_u *)&idx[good]);
        tmp = _mm256_cvtepu8_epi32(rid);
        d = _mm256_permutevar8x32_epi32(d, tmp);
        _mm256_storeu_si256((__m256i_u *)&r[ctr], d);
        ctr += __builtin_popcount(good);
    }

    /* Scalar tail: two candidates per 5 bytes; the second one is dropped
     * when the output is already full. */
    while (ctr < len && pos + 5 <= buflen) {
        vec[0]  = buf[pos + 0];
        vec[0] |= (uint32_t)buf[pos + 1] << 8;
        vec[0] |= (uint32_t)buf[pos + 2] << 16;
        vec[0] &= 0xFFFFF;

        vec[1]  = buf[pos + 2] >> 4;
        vec[1] |= (uint32_t)buf[pos + 3] << 4;
        vec[1] |= (uint32_t)buf[pos + 4] << 12;

        pos += 5;

        if (vec[0] <= 2 * GAMMA1 - 2) {
            r[ctr++] = Q + GAMMA1 - 1 - vec[0];
        }
        if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) {
            r[ctr++] = Q + GAMMA1 - 1 - vec[1];
        }
    }

    return ctr;
}

+ 26
- 0
crypto_sign/dilithium2/avx2/rejsample.h View File

@@ -0,0 +1,26 @@
#ifndef REJSAMPLE_H
#define REJSAMPLE_H

#include <stdint.h>

#include "poly.h"

/*
 * Rejection samplers. Each one reads candidates from buf (buflen bytes),
 * writes accepted coefficients to r and returns how many were written,
 * never more than len.
 */

/* 3 bytes per candidate, masked to 23 bits, accepted when below Q. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

/* Two 4-bit candidates per byte, accepted when at most 2*ETA and stored
 * as Q + ETA - candidate. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

/* Two 20-bit candidates per 5 bytes, accepted when at most 2*GAMMA1 - 2
 * and stored as Q + GAMMA1 - 1 - candidate. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

#endif

+ 115
- 0
crypto_sign/dilithium2/avx2/rounding.c View File

@@ -0,0 +1,115 @@
#include "rounding.h"

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_power2round
*
* Description: For finite field element a, compute a0, a1 such that
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
* Assumes a to be standard representative.
* The computation is branch-free.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0) {
int32_t t;

/* Centralized remainder mod 2^D */
/* t = a mod 2^D, in [0, 2^D) */
t = a & ((1U << D) - 1);
/* Shift down so that exactly the values that must stay positive after
 * centering become negative here. */
t -= (1U << (D - 1)) + 1;
/* t >> 31 is all-ones for negative t (relies on arithmetic right shift of
 * a signed value), so 2^D is added back only in that case. */
t += (t >> 31) & (1U << D);
/* In total 2^D (or 0) has been subtracted, so t is still congruent to a
 * mod 2^D and now lies in (-2^{D-1}, 2^{D-1}]. */
t -= (1U << (D - 1)) - 1;
/* The low part is returned with an offset of Q so that it is non-negative. */
*a0 = Q + t;
/* a - t is a multiple of 2^D; the quotient is the high part. */
a = (a - t) >> D;
return a;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_decompose
*
* Description: For finite field element a, compute high and low bits a0, a1 such
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard
* representative.
* The computation is branch-free.
*
* Arguments: - uint32_t a: input element
* - uint32_t *a0: pointer to output element Q + a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0) {
int32_t t, u;

/* Centralized remainder mod ALPHA */
/* NOTE(review): folding the bits above 2^19 back in with weight 2^9 is
 * consistent with ALPHA = 2^19 - 2^9; ALPHA is defined in params.h, which
 * is not visible here -- confirm. */
t = a & 0x7FFFF;
t += (a >> 19) << 9;
/* Same centering trick as in power2round: subtract, conditionally add
 * ALPHA back when negative (t >> 31 is all-ones then), subtract the rest. */
t -= ALPHA / 2 + 1;
t += (t >> 31) & ALPHA;
t -= ALPHA / 2 - 1;
/* Remove the low part from a. */
a -= t;

/* Divide by ALPHA (possible to avoid) */
/* u becomes all-ones exactly when a == 0, so the "+ 1" below is cancelled
 * in that case and the quotient stays 0. */
u = a - 1;
u >>= 31;
a = (a >> 19) + 1;
a -= u & 1;

/* Border case */
/* When the quotient reached 16, a >> 4 is 1: the low part is decreased by
 * one and the masking below wraps the high part to 0. */
*a0 = Q + t - (a >> 4);
a &= 0xF;
return a;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_make_hint
*
* Description: Decide whether a hint bit has to be set for a coefficient,
*              given its low part a0 and high part a1. Inputs are assumed
*              to be standard representatives.
*
* Arguments:   - uint32_t a0: low bits of input element
*              - uint32_t a1: high bits of input element
*
* Returns 0 when a0 <= GAMMA2, when a0 > Q - GAMMA2, or when
* a0 == Q - GAMMA2 and a1 == 0; returns 1 in every other case.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(const uint32_t a0, const uint32_t a1) {
    /* Low part small and non-negative: no hint. */
    if (a0 <= GAMMA2) {
        return 0;
    }

    /* Low part small and negative (stored as a value close to Q): no hint. */
    if (a0 > Q - GAMMA2) {
        return 0;
    }

    /* Exact lower boundary only counts as "no hint" when the high part is 0. */
    if (a0 == Q - GAMMA2 && a1 == 0) {
        return 0;
    }

    return 1;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_use_hint
*
* Description: Recover the corrected high bits of a from a and its hint
*              bit. Without a hint the high bits from decompose are
*              returned as they are; with a hint they are moved by one,
*              modulo 16, in the direction given by the low bits.
*
* Arguments:   - uint32_t a: input element
*              - unsigned int hint: hint bit
*
* Returns corrected high bits.
*
* Note: if decompose did not divide out ALPHA, the adjustment would be
* (a1 + ALPHA) % (Q - 1) respectively (a1 - ALPHA) % (Q - 1) instead.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(const uint32_t a, const unsigned int hint) {
    uint32_t low, high;

    high = PQCLEAN_DILITHIUM2_AVX2_decompose(a, &low);

    if (hint != 0) {
        /* decompose returns the low part offset by Q, so "low > Q" means
         * the low part is positive: step up, otherwise step down. */
        high = (low > Q) ? (high + 1) : (high - 1);
        high &= 0xF;
    }

    return high;
}

+ 12
- 0
crypto_sign/dilithium2/avx2/rounding.h View File

@@ -0,0 +1,12 @@
#ifndef ROUNDING_H
#define ROUNDING_H

#include "params.h"
#include <stdint.h>

/* Split a into a1*2^D + a0; returns a1 and stores Q + a0 in *a0. */
uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0);
/* Split a into a1*ALPHA + a0; returns a1 and stores Q + a0 in *a0. */
uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0);
/* Returns the hint bit (0 or 1) for low part a0 and high part a1. */
unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(uint32_t a0, uint32_t a1);
/* Returns the high bits of a, corrected according to the hint bit. */
uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(uint32_t a, unsigned int hint);

#endif

+ 23
- 0
crypto_sign/dilithium2/avx2/shuffle.inc View File

@@ -0,0 +1,23 @@
# shuffle8 r0,r1,r2,r3 -- arguments are ymm register numbers.
# Regroups 128-bit halves: output r2 receives the low half of r0 followed by
# the low half of r1, output r3 receives the high half of r0 followed by the
# high half of r1.
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle4 r0,r1,r2,r3 -- arguments are ymm register numbers.
# Interleaves 64-bit elements within each 128-bit lane: output r2 receives
# the low quadwords of r0 and r1, output r3 receives the high quadwords.
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle2 r0,r1,r2,r3 -- arguments are ymm register numbers.
# Interleaves 32-bit elements within each 64-bit group: output r2 receives
# the even doublewords of r0 and r1, output r3 receives the odd doublewords
# of r0 and r1.
# Clobbers ymm12 and ymm13, which therefore must not be passed as arguments.
.macro shuffle2 r0,r1,r2,r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
.endm

# shuffle1 r0,r1,r2,r3 -- arguments are ymm register numbers.
# Interleaves 16-bit elements within each 32-bit group: output r2 receives
# the even words of r0 and r1, output r3 receives the odd words of r0 and r1.
# Clobbers ymm12 and ymm13, which therefore must not be passed as arguments.
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm12
vpsrld $16,%ymm\r0,%ymm13
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
.endm

+ 433
- 0
crypto_sign/dilithium2/avx2/sign.c View File

@@ -0,0 +1,433 @@
#include <stdint.h>
#include <string.h>

#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_expand_mat
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
* random coefficients a_{i,j} by performing rejection
* sampling on the output stream of SHAKE128(rho|i|j).
* The twelve entries (4 rows of 3 polynomials) are filled by
* three calls that each sample four polynomials; the nonce
* passed for entry mat[i].vec[j] is 256*i + j.
*
* Arguments: - polyvecl mat[K]: output matrix
* - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[4], const uint8_t rho[SEEDBYTES]) {
/* Entries (0,0), (0,1), (0,2), (1,0) */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[0].vec[0],
&mat[0].vec[1],
&mat[0].vec[2],
&mat[1].vec[0],
rho, 0, 1, 2, 256);
/* Entries (1,1), (1,2), (2,0), (2,1) */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[1].vec[1],
&mat[1].vec[2],
&mat[2].vec[0],
&mat[2].vec[1],
rho, 257, 258, 512, 513);
/* Entries (2,2), (3,0), (3,1), (3,2) */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[2].vec[2],
&mat[3].vec[0],
&mat[3].vec[1],
&mat[3].vec[2],
rho, 514, 768, 769, 770);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_challenge
*
* Description: Implementation of H. Samples polynomial with 60 nonzero
* coefficients in {-1,1} using the output stream of
* SHAKE256(mu|w1).
* A coefficient of -1 is stored as Q - 1.
*
* Arguments: - poly *c: pointer to output polynomial
* - const uint8_t mu[]: byte array containing mu
* - const polyveck *w1: pointer to vector w1
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c,
const uint8_t mu[CRHBYTES],
const polyveck *w1) {
unsigned int i, b, pos;
uint64_t signs;
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
uint8_t outbuf[SHAKE256_RATE];
shake256ctx state;

/* Hash input: mu followed by the packed polynomials of w1 */
for (i = 0; i < CRHBYTES; ++i) {
inbuf[i] = mu[i];
}
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]);
}

shake256_absorb(&state, inbuf, sizeof(inbuf));
shake256_squeezeblocks(outbuf, 1, &state);

/* The first 8 output bytes, read little-endian, supply the sign bits */
signs = 0;
for (i = 0; i < 8; ++i) {
signs |= (uint64_t) outbuf[i] << 8 * i;
}

/* Position bytes are taken from the stream after the sign bytes */
pos = 8;

for (i = 0; i < N; ++i) {
c->coeffs[i] = 0;
}

/* 60 iterations (i = 196..255): pick a position b <= i by rejection, move
 * whatever is at b to position i and put a fresh +-1 at b. */
for (i = 196; i < 256; ++i) {
do {
/* Squeeze another block once the current one is used up */
if (pos >= SHAKE256_RATE) {
shake256_squeezeblocks(outbuf, 1, &state);
pos = 0;
}

b = outbuf[pos++];
} while (b > i);

c->coeffs[i] = c->coeffs[b];
c->coeffs[b] = 1;
/* Branch-free sign selection: if the current sign bit is set, flip the 1
 * into Q - 1 by XOR with (1 ^ (Q - 1)); otherwise XOR with 0. */
c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1));
signs >>= 1;
}
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair
*
* Description: Generates public and private key.
*
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
unsigned int i;
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
uint16_t nonce = 0;
polyvecl mat[K];
polyvecl s1, s1hat;
polyveck s2, t, t1, t0;

/* Draw 3 * SEEDBYTES random bytes and use them directly as rho, rhoprime
 * and key (SEEDBYTES each, in that order) */
randombytes(seedbuf, 3 * SEEDBYTES);
rho = seedbuf;
rhoprime = seedbuf + SEEDBYTES;
key = seedbuf + 2 * SEEDBYTES;

/* Expand matrix */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);

/* Sample short vectors s1 and s2 */
/* The sampler always produces four polynomials. Seven are needed (three for
 * s1, four for s2), so the eighth output goes to t.vec[0] with nonce 0 as a
 * throwaway; t is presumably overwritten by the multiplication below. */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s2.vec[0], rhoprime,
nonce, nonce + 1, nonce + 2, nonce + 3);
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &t.vec[0], rhoprime,
nonce + 4, nonce + 5, nonce + 6, 0);

/* Matrix-vector multiplication */
/* Work on a copy so that s1 itself stays untransformed for pack_sk */
s1hat = s1;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1hat);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat);
/* NOTE(review): the reduction between the two calls is disabled here,
 * whereas crypto_sign_signature performs it -- confirm it is not needed. */
//PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&t.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&t.vec[i]);
}

/* Add error vector s2 */
PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&t, &t, &s2);

/* Extract t1 and write public key */
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&t);
PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(&t1, &t0, &t);
PQCLEAN_DILITHIUM2_AVX2_pack_pk(pk, rho, &t1);

/* Compute CRH(rho, t1) and write secret key */
/* The hash is taken over the packed public key that was just written */
crh(tr, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM2_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature
*
* Description: Compute a detached signature of a message. Candidate
* signatures are generated in a loop and rejected (goto rej)
* until all checks pass.
*
* Arguments: - uint8_t *sig: pointer to output signature (allocated array
* of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES bytes)
* - size_t *siglen: pointer to output length of the signature
* (set to PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES)
* - const uint8_t *m: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen,
const uint8_t *sk) {
size_t i;
unsigned int n;
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint16_t nonce = 0;
poly c, chat;
polyvecl mat[K], s1, y, yhat, z;
polyveck t0, s2, w, w1, w0;
polyveck h, cs2, ct0;

/* seedbuf layout: rho | tr | key | mu | rhoprime. key and mu are adjacent
 * on purpose: they are hashed together as one buffer below. */
rho = seedbuf;
tr = rho + SEEDBYTES;
key = tr + CRHBYTES;
mu = key + SEEDBYTES;
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);


// use incremental hash API instead of copying around buffers
/* Compute CRH(tr, m) */
shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);

/* rhoprime = CRH(key | mu); the input is the contiguous key/mu region */
crh(rhoprime, key, SEEDBYTES + CRHBYTES);

/* Expand matrix and transform vectors */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0);

rej:
/* Sample intermediate vector y */
/* The sampler always produces four polynomials; only three are needed, so
 * the fourth goes to yhat.vec[0], which is overwritten by "yhat = y". */
PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &yhat.vec[0],
rhoprime, nonce, nonce + 1, nonce + 2, 0);
/* Fresh nonces for the next attempt */
nonce += 3;

/* Matrix-vector multiplication */
yhat = y;
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&yhat);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat);
PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&w.vec[i]);
}

/* Decompose w and call the random oracle */
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w);
PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &w0, &w);
PQCLEAN_DILITHIUM2_AVX2_challenge(&c, mu, &w1);
/* Keep c untransformed for packing; chat is the copy that gets transformed */
chat = c;
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat);

/* Check that subtracting cs2 does not change high bits of w and low bits
* do not reveal secret information */
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&cs2.vec[i]);
}
PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&w0, &w0, &cs2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&w0);
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
goto rej;
}

/* Compute z, reject if it reveals secret */
for (i = 0; i < L; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&z.vec[i]);
}
PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(&z, &z, &y);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(&z);
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
goto rej;
}

/* Compute hints for w1 */
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]);
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&ct0.vec[i]);
}

PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&ct0);
if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&ct0, GAMMA2)) {
goto rej;
}

PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&w0, &w0, &ct0);
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w0);
n = PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(&h, &w0, &w1);
/* Reject when make_hint reports more than OMEGA */
if (n > OMEGA) {
goto rej;
}

/* Write signature */
PQCLEAN_DILITHIUM2_AVX2_pack_sig(sig, &z, &h, &c);
*siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_crypto_sign
*
* Description: Compute signed message: the signature followed by a copy
*              of the message.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                             array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes),
*                             can be equal to m
*              - size_t *smlen: pointer to output length of signed
*                               message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns the return value of crypto_sign_signature (0 on success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(
    uint8_t *sm, size_t *smlen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk) {
    int rc;
    uint8_t *msg = sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;

    /* memmove because sm and m are allowed to overlap. */
    memmove(msg, m, mlen);

    /* Sign the relocated copy: when the buffers overlap, the bytes at m may
     * already have been overwritten by the move above, so reading the
     * message from m would sign the wrong data. The signature area
     * [sm, msg) never overlaps the copy. */
    rc = PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, msg, mlen, sk);
    *smlen += mlen;
    return rc;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify
*
* Description: Verify a detached signature of a message.
*
* Arguments: - const uint8_t *sig: signature
* - size_t siglen: length of signature; anything shorter than
* PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES is rejected
* - const uint8_t *m: pointer to message
* - size_t mlen: length of message
* - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pk) {
size_t i;
uint8_t rho[SEEDBYTES];
uint8_t mu[CRHBYTES];
poly c, chat, cp;
polyvecl mat[K], z;
polyveck t1, w1, h, tmp1, tmp2;

/* Only a lower bound is checked; bytes beyond CRYPTO_BYTES are not read
 * by this function. */
if (siglen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
return -1;
}

PQCLEAN_DILITHIUM2_AVX2_unpack_pk(rho, &t1, pk);
/* A non-zero return from unpack_sig means the encoding was rejected */
if (PQCLEAN_DILITHIUM2_AVX2_unpack_sig(&z, &h, &c, sig)) {
return -1;
}
if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
return -1;
}

/* Compute CRH(CRH(rho, t1), msg) */
/* The inner hash is taken over the packed public key; mu first holds that
 * hash and is then reused for the final value. */
crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);

shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, mu, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);

/* Matrix-vector multiplication; compute Az - c2^dt1 */
PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);
PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&z);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z);
}

/* Keep c untransformed for the final comparison; transform a copy */
chat = c;
PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat);
PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(&t1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t1);
for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]);
}

/* Subtract in the transformed domain, then transform back once */
PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2);
PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(&tmp1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(&tmp1);

/* Reconstruct w1 */
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&tmp1);
PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(&w1, &tmp1, &h);

/* Call random oracle and verify challenge */
PQCLEAN_DILITHIUM2_AVX2_challenge(&cp, mu, &w1);
for (i = 0; i < N; ++i) {
if (c.coeffs[i] != cp.coeffs[i]) {
return -1;
}
}

return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open
*
* Description: Verify a signed message (signature followed by message)
*              and, on success, extract the message.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                            array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1
* otherwise. On failure *mlen is set to (size_t) -1 and the first smlen
* bytes of m are zeroed.
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(
    uint8_t *m, size_t *mlen,
    const uint8_t *sm, size_t smlen,
    const uint8_t *pk) {
    size_t k;
    int accepted = 0;

    /* A signed message must at least contain a complete signature. */
    if (smlen >= PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) {
        *mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES;
        accepted = (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(
                        sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES,
                        sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk) == 0);
    }

    if (accepted) {
        /* Forward byte copy of the message part that follows the signature. */
        for (k = 0; k < *mlen; ++k) {
            m[k] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + k];
        }
        return 0;
    }

    /* Too short or verification failed: report an invalid length and wipe
     * the output buffer. */
    *mlen = (size_t) -1;
    for (k = 0; k < smlen; ++k) {
        m[k] = 0;
    }

    return -1;
}

+ 15
- 0
crypto_sign/dilithium2/avx2/sign.h View File

@@ -0,0 +1,15 @@
#ifndef SIGN_H
#define SIGN_H

#include "api.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/* Fills the matrix mat from the seed rho. */
void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
/* Derives the challenge polynomial c from mu and the vector w1. */
void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES],
const polyveck *w1);


#endif


+ 26
- 0
crypto_sign/dilithium2/avx2/stream.c View File

@@ -0,0 +1,26 @@
#include "stream.h"

#include <string.h>

/*
 * Initialise a SHAKE128 state by absorbing the seed followed by the
 * 16-bit nonce in little-endian byte order.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {

    uint8_t input[SEEDBYTES + 2];
    uint8_t *tail = input + SEEDBYTES;

    memcpy(input, seed, SEEDBYTES);
    tail[0] = (uint8_t)nonce;          /* low byte first */
    tail[1] = (uint8_t)(nonce >> 8);   /* then high byte */

    shake128_absorb(state, input, sizeof(input));
}


/*
 * Initialise a SHAKE256 state by absorbing the CRHBYTES-long seed followed
 * by the 16-bit nonce in little-endian byte order.
 */
void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {

    uint8_t input[CRHBYTES + 2];
    uint8_t *tail = input + CRHBYTES;

    memcpy(input, seed, CRHBYTES);
    tail[0] = (uint8_t)nonce;          /* low byte first */
    tail[1] = (uint8_t)(nonce >> 8);   /* then high byte */

    shake256_absorb(state, input, sizeof(input));
}

+ 15
- 0
crypto_sign/dilithium2/avx2/stream.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_STREAM_H
#define PQCLEAN_DILITHIUM2_AVX2_STREAM_H

#include <stdint.h>

#include "fips202.h"
#include "params.h"

/* Absorbs seed (SEEDBYTES bytes) and the nonce into a SHAKE128 state. */
void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/* Absorbs seed (CRHBYTES bytes) and the nonce into a SHAKE256 state. */
void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

#endif

+ 23
- 0
crypto_sign/dilithium2/avx2/symmetric.h View File

@@ -0,0 +1,23 @@
#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H

#include "params.h"
#include "stream.h"


#include "fips202.h"

/* Collision-resistant hash: SHAKE256 with a fixed output of CRHBYTES bytes. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
/* Generic stream names mapped onto the SHAKE128/SHAKE256 based functions. */
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE)

/* Bytes produced per squeezed block of each stream. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* State types that go with the stream macros above. */
typedef shake128ctx stream128_state;
typedef shake256ctx stream256_state;


#endif

+ 6
- 2
crypto_sign/dilithium2/clean/LICENSE View File

@@ -1,2 +1,6 @@
Public Domain
Authors: Léo Ducas, Eike Kiltz, Tancrède Lepoint, Vadim Lyubashevsky, Gregor Seiler, Peter Schwabe, Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.

+ 3
- 3
crypto_sign/dilithium2/clean/Makefile View File

@@ -2,10 +2,10 @@

LIB=libdilithium2_clean.a

SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c symmetric.c
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o symmetric.o
SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o
HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
reduce.h rounding.h symmetric.h
reduce.h rounding.h symmetric.h stream.h

CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libdilithium2_clean.lib
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj symmetric.obj
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX

all: $(LIBRARY)


+ 14
- 10
crypto_sign/dilithium2/clean/api.h View File

@@ -4,14 +4,25 @@
#include <stddef.h>
#include <stdint.h>

#define MODE 2

#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1184U
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2800U
#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2044U

#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2"

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
@@ -21,13 +32,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);


#endif

+ 15
- 14
crypto_sign/dilithium2/clean/ntt.c View File

@@ -1,11 +1,12 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include <stdint.h>

/* Roots of unity in order needed by forward ntt */
static const uint32_t zetas[N] = {
/* Roots of unity in order needed by the forward NTT (PQCLEAN_DILITHIUM2_CLEAN_ntt) */
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas[N] = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347,
2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464,
1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231,
@@ -40,8 +41,8 @@ static const uint32_t zetas[N] = {
8332111, 7018208, 3937738, 1400424, 7534263, 1976782
};

/* Roots of unity in order needed by inverse ntt */
static const uint32_t zetas_inv[N] = {
/* Roots of unity in order needed by the inverse NTT (PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont) */
static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[N] = {
6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416,
3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036,
3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683,
@@ -77,7 +78,7 @@ static const uint32_t zetas_inv[N] = {
};

/*************************************************
* Name: ntt
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt
*
* Description: Forward NTT, in-place. No modular reduction is performed after
* additions or subtractions. Hence output coefficients can be up
@@ -86,16 +87,16 @@ static const uint32_t zetas_inv[N] = {
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) {
void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t *p) {
unsigned int len, start, j, k;
uint32_t zeta, t;

k = 1;
for (len = 128; len > 0; len >>= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas[k++];
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas[k++];
for (j = start; j < start + len; ++j) {
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]);
t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
p[j + len] = p[j] + 2 * Q - t;
p[j] = p[j] + t;
}
@@ -104,7 +105,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) {
}

/*************************************************
* Name: invntt_frominvmont
* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont
*
* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
* In-place. No modular reductions after additions or
@@ -113,7 +114,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) {
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) {
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t *p) {
unsigned int start, len, j, k;
uint32_t t, zeta;
const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;
@@ -121,17 +122,17 @@ void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) {
k = 0;
for (len = 1; len < N; len <<= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas_inv[k++];
zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[k++];
for (j = start; j < start + len; ++j) {
t = p[j];
p[j] = t + p[j + len];
p[j + len] = t + 256 * Q - p[j + len];
p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]);
p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
}
}
}

for (j = 0; j < N; ++j) {
p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)f * p[j]);
p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) f * p[j]);
}
}

+ 4
- 3
crypto_sign/dilithium2/clean/ntt.h View File

@@ -1,9 +1,10 @@
#ifndef NTT_H
#define NTT_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H

#include "params.h"
#include <stdint.h>

#include "params.h"

void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]);
void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]);



+ 62
- 54
crypto_sign/dilithium2/clean/packing.c View File

@@ -4,17 +4,18 @@
#include "polyvec.h"

/*************************************************
* Name: pack_pk
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk
*
* Description: Bit-pack public key pk = (rho, t1).
*
* Arguments: - unsigned char pk[]: output byte array
* - const unsigned char rho[]: byte array containing rho
* Arguments: - uint8_t pk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
const unsigned char rho[SEEDBYTES],
const polyveck *t1) {
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
uint8_t *pk,
const uint8_t *rho,
const polyveck *t1) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -28,17 +29,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
}

/*************************************************
* Name: unpack_pk
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_pk
*
* Description: Unpack public key pk = (rho, t1).
*
* Arguments: - const unsigned char rho[]: output byte array for rho
* Arguments: - uint8_t rho[]: output byte array for rho
* - polyveck *t1: pointer to output vector t1
* - unsigned char pk[]: byte array containing bit-packed pk
* - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES],
polyveck *t1,
const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) {
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(
uint8_t *rho,
polyveck *t1,
const uint8_t *pk) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -52,25 +54,26 @@ void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES],
}

/*************************************************
* Name: pack_sk
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk
*
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - unsigned char sk[]: output byte array
* - const unsigned char rho[]: byte array containing rho
* - const unsigned char key[]: byte array containing key
* - const unsigned char tr[]: byte array containing tr
* Arguments: - uint8_t sk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const uint8_t key[]: byte array containing key
* - const uint8_t tr[]: byte array containing tr
* - const polyvecl *s1: pointer to vector s1
* - const polyveck *s2: pointer to vector s2
* - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
const unsigned char rho[SEEDBYTES],
const unsigned char key[SEEDBYTES],
const unsigned char tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -104,25 +107,26 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
}

/*************************************************
* Name: unpack_sk
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk
*
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - const unsigned char rho[]: output byte array for rho
* - const unsigned char key[]: output byte array for key
* - const unsigned char tr[]: output byte array for tr
* Arguments: - uint8_t rho[]: output byte array for rho
* - uint8_t key[]: output byte array for key
* - uint8_t tr[]: output byte array for tr
* - polyvecl *s1: pointer to output vector s1
* - polyveck *s2: pointer to output vector s2
* - polyveck *t0: pointer to output vector t0
* - unsigned char sk[]: byte array containing bit-packed sk
* - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
unsigned char key[SEEDBYTES],
unsigned char tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const unsigned char sk[CRYPTO_SECRETKEYBYTES]) {
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -156,19 +160,20 @@ void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
}

/*************************************************
* Name: pack_sig
* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig
*
* Description: Bit-pack signature sig = (z, h, c).
*
* Arguments: - unsigned char sig[]: output byte array
* Arguments: - uint8_t sig[]: output byte array
* - const polyvecl *z: pointer to vector z
* - const polyveck *h: pointer to hint vector h
* - const poly *c: pointer to challenge polynomial
* - const poly *c: pointer to challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
const polyvecl *z,
const polyveck *h,
const poly *c) {
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(
uint8_t *sig,
const polyvecl *z,
const polyveck *h,
const poly *c) {
unsigned int i, j, k;
uint64_t signs, mask;

@@ -182,10 +187,11 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
if (h->vec[i].coeffs[j] != 0) {
sig[k++] = (unsigned char) j;
sig[k++] = (uint8_t)j;
}
}
sig[OMEGA + i] = (unsigned char) k;

sig[OMEGA + i] = (uint8_t)k;
}
while (k < OMEGA) {
sig[k++] = 0;
@@ -199,7 +205,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
sig[i] = 0;
for (j = 0; j < 8; ++j) {
if (c->coeffs[8 * i + j] != 0) {
sig[i] |= (unsigned char) (1U << j);
sig[i] |= (uint8_t)(1u << j);
if (c->coeffs[8 * i + j] == (Q - 1)) {
signs |= mask;
}
@@ -209,27 +215,28 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
}
sig += N / 8;
for (i = 0; i < 8; ++i) {
sig[i] = (unsigned char) (signs >> 8 * i);
sig[i] = (uint8_t)(signs >> 8u * i);
}
}

/*************************************************
* Name: unpack_sig
* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (z, h, c).
*
* Arguments: - polyvecl *z: pointer to output vector z
* - polyveck *h: pointer to output hint vector h
* - poly *c: pointer to output challenge polynomial
* - const unsigned char sig[]: byte array containing
* - poly *c: pointer to output challenge polynomial
* - const uint8_t sig[]: byte array containing
* bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z,
polyveck *h,
poly *c,
const unsigned char sig[CRYPTO_BYTES]) {
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t *sig) {
unsigned int i, j, k;
uint64_t signs;

@@ -266,6 +273,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z,
return 1;
}
}

sig += OMEGA + K;

/* Decode c */


+ 29
- 24
crypto_sign/dilithium2/clean/packing.h View File

@@ -1,31 +1,36 @@
#ifndef PACKING_H
#define PACKING_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H

#include "params.h"
#include "polyvec.h"

void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
const unsigned char rho[SEEDBYTES], const polyveck *t1);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
const unsigned char rho[SEEDBYTES],
const unsigned char key[SEEDBYTES],
const unsigned char tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
const polyvecl *z, const polyveck *h, const poly *c);
void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(
uint8_t *pk,
const uint8_t *rho, const polyveck *t1);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(
uint8_t *sig,
const polyvecl *z, const polyveck *h, const poly *c);

void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1,
const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
unsigned char key[SEEDBYTES],
unsigned char tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const unsigned char sk[CRYPTO_SECRETKEYBYTES]);
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z, polyveck *h, poly *c,
const unsigned char sig[CRYPTO_BYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(
uint8_t *rho, polyveck *t1,
const uint8_t *pk);
void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(
polyvecl *z, polyveck *h, poly *c, const uint8_t *sig);

#endif

+ 3
- 5
crypto_sign/dilithium2/clean/params.h View File

@@ -1,19 +1,17 @@
#ifndef PARAMS_H
#define PARAMS_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H


#define SEEDBYTES 32
#define CRHBYTES 48
#define N 256
#define Q 8380417
#define QBITS 23
#define ROOT_OF_UNITY 1753
#define D 14
#define GAMMA1 ((Q - 1)/16)
#define GAMMA2 (GAMMA1/2)
#define ALPHA (2*GAMMA2)


// DilithiumII parameters
#define K 4
#define L 3
#define ETA 6


+ 131
- 164
crypto_sign/dilithium2/clean/poly.c View File

@@ -1,10 +1,11 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rounding.h"
#include "symmetric.h"
#include <stdint.h>


/*************************************************
@@ -16,8 +17,7 @@
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a->coeffs[i]);
}
}
@@ -31,8 +31,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_csubq(a->coeffs[i]);
}
}
@@ -46,8 +45,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_freeze(a->coeffs[i]);
}
}
@@ -61,9 +59,8 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) {
* - const poly *a: pointer to first summand
* - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
unsigned int i;
for (i = 0; i < N; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
}
}
@@ -78,11 +75,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
* Arguments: - poly *c: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial to be
* subtraced from first input polynomial
* subtracted from first input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i];
}
}
@@ -96,8 +92,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] <<= D;
}
}
@@ -139,9 +134,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]);
}

@@ -160,12 +153,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]);
}

}

/*************************************************
@@ -182,12 +172,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]);
}

}

/*************************************************
@@ -204,13 +191,11 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a)
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) {
unsigned int i, s = 0;

for (i = 0; i < N; ++i) {
unsigned int s = 0;
for (size_t i = 0; i < N; ++i) {
h->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]);
s += h->coeffs[i];
}

return s;
}

@@ -224,12 +209,9 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, co
* - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]);
}

}

/*************************************************
@@ -244,15 +226,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *
* Returns 0 if norm is strictly smaller than B and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
unsigned int i;
int32_t t;

/* It is ok to leak which coefficient violates the bound since
the probability for each coefficient is independent of secret
data but we must not leak the sign of the centralized representative. */
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
/* Absolute value of centralized representative */
t = (int32_t) ((Q - 1) / 2 - a->coeffs[i]);
t = (int32_t)((Q - 1) / 2 - a->coeffs[i]);
t ^= (t >> 31);
t = (Q - 1) / 2 - t;

@@ -260,7 +240,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
return 1;
}
}

return 0;
}

@@ -272,7 +251,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - size_t buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -280,8 +259,8 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
**************************************************/
static unsigned int rej_uniform(uint32_t *a,
unsigned int len,
const unsigned char *buf,
unsigned int buflen) {
const uint8_t *buf,
size_t buflen) {
unsigned int ctr, pos;
uint32_t t;

@@ -308,19 +287,20 @@ static unsigned int rej_uniform(uint32_t *a,
* output stream from SHAKE128(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES)/STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t seed[SEEDBYTES],
uint16_t nonce) {
unsigned int i, ctr, off;
unsigned int buflen = POLY_UNIFORM_BUFLEN;
unsigned char buf[POLY_UNIFORM_BUFLEN + 2];
shake128ctx state;
unsigned int i, ctr;
size_t buflen = POLY_UNIFORM_BUFLEN;
uint8_t buf[POLY_UNIFORM_BUFLEN + 2];
stream128_state state;
size_t off;

stream128_init(&state, seed, nonce);
stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
@@ -347,7 +327,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -355,7 +335,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
**************************************************/
static unsigned int rej_eta(uint32_t *a,
unsigned int len,
const unsigned char *buf,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint32_t t0, t1;
@@ -384,19 +364,18 @@ static unsigned int rej_eta(uint32_t *a,
* output stream from SHAKE128(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS (((N/2 * (1U << SETABITS)) / (2*ETA + 1)\
+ STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce) {
unsigned int ctr;
unsigned char buf[POLY_UNIFORM_ETA_BUFLEN];
shake128ctx state;
uint8_t buf[POLY_UNIFORM_ETA_BUFLEN];
stream128_state state;

stream128_init(&state, seed, nonce);
stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
@@ -418,7 +397,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -426,7 +405,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
**************************************************/
static unsigned int rej_gamma1m1(uint32_t *a,
unsigned int len,
const unsigned char *buf,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint32_t t0, t1;
@@ -438,7 +417,7 @@ static unsigned int rej_gamma1m1(uint32_t *a,
t0 |= (uint32_t)buf[pos + 2] << 16;
t0 &= 0xFFFFF;

t1 = buf[pos + 2] >> 4;
t1 = buf[pos + 2] >> 4;
t1 |= (uint32_t)buf[pos + 3] << 4;
t1 |= (uint32_t)buf[pos + 4] << 12;

@@ -463,19 +442,19 @@ static unsigned int rej_gamma1m1(uint32_t *a,
* sampling on output stream of SHAKE256(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* CRHBYTES
* - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES)
#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a,
const unsigned char seed[CRHBYTES],
const uint8_t seed[CRHBYTES],
uint16_t nonce) {
unsigned int i, ctr, off;
unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN;
unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
shake256ctx state;
uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
stream256_state state;

stream256_init(&state, seed, nonce);
stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state);
@@ -500,18 +479,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a,
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
* Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLETA_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a) {
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a) {
unsigned int i;
unsigned char t[8];
uint8_t t[8];

for (i = 0; i < N / 2; ++i) {
t[0] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 0]);
t[1] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 1]);
r[i] = (uint8_t) (t[0] | (t[1] << 4));
t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]);
t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]);
r[i] = (uint8_t)(t[0] | (t[1] << 4));
}
}

@@ -522,68 +501,67 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a) {
* Output coefficients lie in [Q-ETA,Q+ETA].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) {
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
r->coeffs[2 * i + 0] = a[i] & 0x0F;
r->coeffs[2 * i + 1] = a[i] >> 4;
r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1];
}

}

/*************************************************
* Name: polyt1_pack
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLT1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(unsigned char *r, const poly *a) {
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a) {
unsigned int i;

for (i = 0; i < N / 8; ++i) {
r[9 * i + 0] = (uint8_t) ((a->coeffs[8 * i + 0] >> 0));
r[9 * i + 1] = (uint8_t) ((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1));
r[9 * i + 2] = (uint8_t) ((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2));
r[9 * i + 3] = (uint8_t) ((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3));
r[9 * i + 4] = (uint8_t) ((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4));
r[9 * i + 5] = (uint8_t) ((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5));
r[9 * i + 6] = (uint8_t) ((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6));
r[9 * i + 7] = (uint8_t) ((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7));
r[9 * i + 8] = (uint8_t) ((a->coeffs[8 * i + 7] >> 1));
r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0));
r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1));
r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2));
r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3));
r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4));
r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5));
r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6));
r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7));
r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1));
}

}

/*************************************************
* Name: polyt1_unpack
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack
*
* Description: Unpack polynomial t1 with 9-bit coefficients.
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) {
unsigned int i;

for (i = 0; i < N / 8; ++i) {
r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF;
r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF;
r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF;
r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF;
r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF;
r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF;
r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF;
r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF;
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) {
for (size_t i = 0; i < N / 8; ++i) {
r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF;
r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF;
r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF;
r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF;
r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF;
r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF;
r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF;
r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF;
}

}

/*************************************************
@@ -592,32 +570,30 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) {
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLT0_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a) {
unsigned int i;
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a) {
uint32_t t[4];

for (i = 0; i < N / 4; ++i) {
t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0];
t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1];
t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2];
t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3];
r[7 * i + 0] = (uint8_t) (t[0]);
r[7 * i + 1] = (uint8_t) (t[0] >> 8);
r[7 * i + 1] |= (uint8_t) (t[1] << 6);
r[7 * i + 2] = (uint8_t) (t[1] >> 2);
r[7 * i + 3] = (uint8_t) (t[1] >> 10);
r[7 * i + 3] |= (uint8_t) (t[2] << 4);
r[7 * i + 4] = (uint8_t) (t[2] >> 4);
r[7 * i + 5] = (uint8_t) (t[2] >> 12);
r[7 * i + 5] |= (uint8_t) (t[3] << 2);
r[7 * i + 6] = (uint8_t) (t[3] >> 6);
for (size_t i = 0; i < N / 4; ++i) {
t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0];
t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1];
t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2];
t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3];
r[7 * i + 0] = (uint8_t)(t[0]);
r[7 * i + 1] = (uint8_t)(t[0] >> 8);
r[7 * i + 1] |= (uint8_t)(t[1] << 6);
r[7 * i + 2] = (uint8_t)(t[1] >> 2);
r[7 * i + 3] = (uint8_t)(t[1] >> 10);
r[7 * i + 3] |= (uint8_t)(t[2] << 4);
r[7 * i + 4] = (uint8_t)(t[2] >> 4);
r[7 * i + 5] = (uint8_t)(t[2] >> 12);
r[7 * i + 5] |= (uint8_t)(t[3] << 2);
r[7 * i + 6] = (uint8_t)(t[3] >> 6);
}

}

/*************************************************
@@ -627,32 +603,30 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a) {
* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) {
unsigned int i;
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) {

for (i = 0; i < N / 4; ++i) {
r->coeffs[4 * i + 0] = a[7 * i + 0];
r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8;
for (size_t i = 0; i < N / 4; ++i) {
r->coeffs[4 * i + 0] = a[7 * i + 0];
r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8;

r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2;
r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10;
r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2;
r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10;

r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4;
r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12;
r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4;
r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12;

r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6;
r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6;

r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0];
r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1];
r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2];
r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3];
}

}

/*************************************************
@@ -662,29 +636,27 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) {
* in [-(GAMMA1 - 1), GAMMA1 - 1].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLZ_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a) {
unsigned int i;
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a) {
uint32_t t[2];

for (i = 0; i < N / 2; ++i) {
for (size_t i = 0; i < N / 2; ++i) {
/* Map to {0,...,2*GAMMA1 - 2} */
t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0];
t[0] += ((int32_t)t[0] >> 31) & Q;
t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1];
t[1] += ((int32_t)t[1] >> 31) & Q;

r[5 * i + 0] = (uint8_t) (t[0]);
r[5 * i + 1] = (uint8_t) (t[0] >> 8);
r[5 * i + 2] = (uint8_t) (t[0] >> 16);
r[5 * i + 2] |= (uint8_t) (t[1] << 4);
r[5 * i + 3] = (uint8_t) (t[1] >> 4);
r[5 * i + 4] = (uint8_t) (t[1] >> 12);
r[5 * i + 0] = (uint8_t)t[0];
r[5 * i + 1] = (uint8_t)(t[0] >> 8);
r[5 * i + 2] = (uint8_t)(t[0] >> 16);
r[5 * i + 2] |= (uint8_t)(t[1] << 4);
r[5 * i + 3] = (uint8_t)(t[1] >> 4);
r[5 * i + 4] = (uint8_t)(t[1] >> 12);
}

}

/*************************************************
@@ -695,26 +667,23 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a) {
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
for (size_t i = 0; i < N / 2; ++i) {
r->coeffs[2 * i + 0] = a[5 * i + 0];
r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16;
r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8;
r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16;

r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4;
r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12;

r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q;
r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q;
}

}

/*************************************************
@@ -723,15 +692,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a) {
* Description: Bit-pack polynomial w1 with coefficients in [0, 15].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLW1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(unsigned char *r, const poly *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4));
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a) {
for (size_t i = 0; i < N / 2; ++i) {
r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4);
}

}

+ 16
- 15
crypto_sign/dilithium2/clean/poly.h View File

@@ -1,9 +1,10 @@
#ifndef POLY_H
#define POLY_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H

#include "params.h"
#include <stdint.h>

#include "params.h"

typedef struct {
uint32_t coeffs[N];
} poly;
@@ -27,27 +28,27 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *

int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a,
const unsigned char seed[CRHBYTES],
const uint8_t seed[CRHBYTES],
uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif

+ 29
- 26
crypto_sign/dilithium2/clean/polyvec.c View File

@@ -1,14 +1,15 @@
#include <stdint.h>

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name: polyvecl_freeze
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze
*
* Description: Reduce coefficients of polynomials in vector of length L
* to standard representatives.
@@ -24,7 +25,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) {
}

/*************************************************
* Name: polyvecl_add
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add
*
* Description: Add vectors of polynomials of length L.
* No modular reduction is performed.
@@ -42,7 +43,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const
}

/*************************************************
* Name: polyvecl_ntt
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L. Output
* coefficients can be up to 16*Q larger than input coefficients.
@@ -58,7 +59,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) {
}

/*************************************************
* Name: polyvecl_pointwise_acc_invmontgomery
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
* resulting vector by 2^{-32} and add (accumulate) polynomials
@@ -85,7 +86,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w,
}

/*************************************************
* Name: polyvecl_chknorm
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input coefficients to be standard representatives.
@@ -96,14 +97,15 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w,
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) {
return 1;
}
}

return 0;
}

@@ -113,7 +115,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound)


/*************************************************
* Name: polyveck_reduce
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
* to representatives in [0,2*Q[.
@@ -129,7 +131,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) {
}

/*************************************************
* Name: polyveck_csubq
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq
*
* Description: For all coefficients of polynomials in vector of length K
* subtract Q if coefficient is bigger than Q.
@@ -145,7 +147,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) {
}

/*************************************************
* Name: polyveck_freeze
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze
*
* Description: Reduce coefficients of polynomials in vector of length K
* to standard representatives.
@@ -161,7 +163,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) {
}

/*************************************************
* Name: polyveck_add
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_add
*
* Description: Add vectors of polynomials of length K.
* No modular reduction is performed.
@@ -179,7 +181,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const
}

/*************************************************
* Name: polyveck_sub
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K.
* Assumes coefficients of polynomials in second input vector
@@ -199,7 +201,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const
}

/*************************************************
* Name: polyveck_shiftl
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl
*
* Description: Multiply vector of polynomials of Length K by 2^D without modular
* reduction. Assumes input coefficients to be less than 2^{32-D}.
@@ -215,7 +217,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) {
}

/*************************************************
* Name: polyveck_ntt
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K. Output
* coefficients can be up to 16*Q larger than input coefficients.
@@ -231,7 +233,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) {
}

/*************************************************
* Name: polyveck_invntt_montgomery
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
* in vector of length K. Input coefficients need to be less
@@ -248,7 +250,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) {
}

/*************************************************
* Name: polyveck_chknorm
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input coefficients to be standard representatives.
@@ -259,19 +261,20 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) {
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t bound) {
int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) {
if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) {
return 1;
}
}

return 0;
}

/*************************************************
* Name: polyveck_power2round
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute a0, a1 such that a mod Q = a1*2^D + a0
@@ -293,7 +296,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, c
}

/*************************************************
* Name: polyveck_decompose
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute high and low bits a0, a1 such that a mod Q = a1*ALPHA + a0
@@ -316,7 +319,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con
}

/*************************************************
* Name: polyveck_make_hint
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint
*
* Description: Compute hint vector.
*
@@ -339,19 +342,19 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
}

/*************************************************
* Name: polyveck_use_hint
* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*
* Arguments: - polyveck *w: pointer to output vector of polynomials with
* corrected high bits
* - const polyveck *u: pointer to input vector
* - const polyveck *v: pointer to input vector
* - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]);
}
}

+ 5
- 4
crypto_sign/dilithium2/clean/polyvec.h View File

@@ -1,9 +1,10 @@
#ifndef POLYVEC_H
#define POLYVEC_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H

#include <stdint.h>

#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
@@ -46,6 +47,6 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con
unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);
void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);

#endif

+ 7
- 6
crypto_sign/dilithium2/clean/reduce.c View File

@@ -1,9 +1,10 @@
#include <stdint.h>

#include "params.h"
#include "reduce.h"
#include <stdint.h>

/*************************************************
* Name: montgomery_reduce
* Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce
*
* Description: For finite field element a with 0 <= a <= Q*2^32,
* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q.
@@ -20,11 +21,11 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a) {
t *= Q;
t = a + t;
t >>= 32;
return (uint32_t) t;
return (uint32_t)t;
}

/*************************************************
* Name: reduce32
* Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32
*
* Description: For finite field element a, compute r \equiv a (mod Q)
* such that 0 <= r < 2*Q.
@@ -43,7 +44,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a) {
}

/*************************************************
* Name: csubq
* Name: PQCLEAN_DILITHIUM2_CLEAN_csubq
*
* Description: Subtract Q if input coefficient is bigger than Q.
*
@@ -58,7 +59,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) {
}

/*************************************************
* Name: freeze
* Name: PQCLEAN_DILITHIUM2_CLEAN_freeze
*
* Description: For finite field element a, compute standard
* representative r = a mod Q.


+ 2
- 2
crypto_sign/dilithium2/clean/reduce.h View File

@@ -1,5 +1,5 @@
#ifndef REDUCE_H
#define REDUCE_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H
#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H

#include <stdint.h>



+ 30
- 16
crypto_sign/dilithium2/clean/rounding.c View File

@@ -1,7 +1,10 @@
#include <stdint.h>

#include "params.h"
#include "rounding.h"

/*************************************************
* Name: power2round
* Name: PQCLEAN_DILITHIUM2_CLEAN_power2round
*
* Description: For finite field element a, compute a0, a1 such that
* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
@@ -17,16 +20,16 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) {

/* Centralized remainder mod 2^D */
t = a & ((1U << D) - 1);
t -= ((1U << (D - 1)) + 1);
t += ((uint32_t)((int32_t)t >> 31) & (1U << D));
t -= ((1U << (D - 1)) - 1);
*a0 = (Q + t);
t -= (1U << (D - 1)) + 1;
t += ((uint32_t)((int32_t)t >> 31) & (1 << D));
t -= (1U << (D - 1)) - 1;
*a0 = Q + t;
a = (a - t) >> D;
return a;
}

/*************************************************
* Name: decompose
* Name: PQCLEAN_DILITHIUM2_CLEAN_decompose
*
* Description: For finite field element a, compute high and low bits a0, a1 such
* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
@@ -41,28 +44,29 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) {
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) {
int32_t t, u;

/* Centralized remainder mod ALPHA */
t = a & 0x7FFFF;
t += (int32_t) ((a >> 19) << 9);
t = a & 0x7FFFFu;
t += (int32_t)((a >> 19u) << 9u);
t -= ALPHA / 2 + 1;
t += (t >> 31) & ALPHA;
t -= ALPHA / 2 - 1;
a -= (uint32_t) t;
a -= (uint32_t)t;

/* Divide by ALPHA (possible to avoid) */
u = (int32_t) a - 1;
u = (int32_t)(a - 1);
u >>= 31;
a = (a >> 19) + 1;
a -= u & 1;

/* Border case */
*a0 = Q + (uint32_t)t - (a >> 4);
a &= 0xF;
*a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u));
a &= 0xFu;
return a;
}

/*************************************************
* Name: make_hint
* Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint
*
* Description: Compute hint bit indicating whether the low bits of the
* input element overflow into the high bits. Inputs assumed to be
@@ -73,7 +77,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) {
*
* Returns 1 if high bits of a and b differ and 0 otherwise.
**************************************************/
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) {
unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(const uint32_t a0, const uint32_t a1) {
if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) {
return 0;
}
@@ -82,7 +86,7 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) {
}

/*************************************************
* Name: use_hint
* Name: PQCLEAN_DILITHIUM2_CLEAN_use_hint
*
* Description: Correct high bits according to hint.
*
@@ -91,7 +95,7 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) {
*
* Returns corrected high bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) {
uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(const uint32_t a, const unsigned int hint) {
uint32_t a0, a1;

a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(a, &a0);
@@ -101,5 +105,15 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) {
if (a0 > Q) {
return (a1 + 1) & 0xF;
}

return (a1 - 1) & 0xF;

/* If PQCLEAN_DILITHIUM2_CLEAN_decompose does not divide out ALPHA:
if(hint == 0)
return a1;
else if(a0 > Q)
return (a1 + ALPHA) % (Q - 1);
else
return (a1 - ALPHA) % (Q - 1);
*/
}

+ 2
- 2
crypto_sign/dilithium2/clean/rounding.h View File

@@ -1,5 +1,5 @@
#ifndef ROUNDING_H
#define ROUNDING_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H
#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H

#include <stdint.h>



+ 94
- 70
crypto_sign/dilithium2/clean/sign.c View File

@@ -1,3 +1,6 @@
#include <stdint.h>
#include <string.h>

#include "fips202.h"
#include "packing.h"
#include "params.h"
@@ -7,19 +10,17 @@
#include "sign.h"
#include "symmetric.h"

#include <stdint.h>

/*************************************************
* Name: expand_mat
* Name: PQCLEAN_DILITHIUM2_CLEAN_expand_mat
*
* Description: Implementation of ExpandA. Generates matrix A with uniformly
* random coefficients a_{i,j} by performing rejection
* sampling on the output stream of SHAKE128(rho|i|j).
*
* Arguments: - polyvecl mat[K]: output matrix
* - const unsigned char rho[]: byte array containing seed rho
* - const uint8_t rho[]: byte array containing seed rho
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) {
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
unsigned int i, j;

for (i = 0; i < K; ++i) {
@@ -30,23 +31,23 @@ void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rh
}

/*************************************************
* Name: challenge
* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge
*
* Description: Implementation of H. Samples polynomial with 60 nonzero
* coefficients in {-1,1} using the output stream of
* SHAKE256(mu|w1).
*
* Arguments: - poly *c: pointer to output polynomial
* - const unsigned char mu[]: byte array containing mu
* - const uint8_t mu[]: byte array containing mu
* - const polyveck *w1: pointer to vector w1
**************************************************/
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c,
const unsigned char mu[CRHBYTES],
const uint8_t mu[CRHBYTES],
const polyveck *w1) {
unsigned int i, b, pos;
uint64_t signs;
unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
unsigned char outbuf[SHAKE256_RATE];
uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
uint8_t outbuf[SHAKE256_RATE];
shake256ctx state;

for (i = 0; i < CRHBYTES; ++i) {
@@ -88,22 +89,22 @@ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c,
}

/*************************************************
* Name: crypto_sign_keypair
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair
*
* Description: Generates public and private key.
*
* Arguments: - unsigned char *pk: pointer to output public key (allocated
* array of CRYPTO_PUBLICKEYBYTES bytes)
* - unsigned char *sk: pointer to output private key (allocated
* array of CRYPTO_SECRETKEYBYTES bytes)
* Arguments: - uint8_t *pk: pointer to output public key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key (allocated
* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
unsigned int i;
unsigned char seedbuf[3 * SEEDBYTES];
unsigned char tr[CRHBYTES];
const unsigned char *rho, *rhoprime, *key;
uint8_t seedbuf[3 * SEEDBYTES];
uint8_t tr[CRHBYTES];
const uint8_t *rho, *rhoprime, *key;
uint16_t nonce = 0;
polyvecl mat[K];
polyvecl s1, s1hat;
@@ -144,19 +145,35 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1);

/* Compute CRH(rho, t1) and write secret key */
crh(tr, pk, CRYPTO_PUBLICKEYBYTES);
crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);
PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

return 0;
}

/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature
*
* Description: Compute signature of a message.
*
* Arguments: - uint8_t *sig: pointer to output signature (array of
* PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES bytes)
* - size_t *siglen: pointer to output length of signature
* (set to PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES)
* - const uint8_t *msg: pointer to message to be signed
* - size_t mlen: length of message
* - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk) {
const uint8_t *msg, size_t mlen,
const uint8_t *sk) {
unsigned long long i;
unsigned int n;
unsigned char seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
unsigned char *rho, *tr, *key, *mu, *rhoprime;
uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
uint8_t *rho, *tr, *key, *mu, *rhoprime;
uint16_t nonce = 0;
poly c, chat;
polyvecl mat[K], s1, y, yhat, z;
@@ -170,13 +187,12 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
rhoprime = mu + CRHBYTES;
PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);


// use incremental hash API instead of copying around buffers
/* Compute CRH(tr, msg) */
shake256incctx state;
shake256_inc_init(&state);
shake256_inc_absorb(&state, tr, CRHBYTES);
shake256_inc_absorb(&state, m, mlen);
shake256_inc_absorb(&state, msg, mlen);
shake256_inc_finalize(&state);
shake256_inc_squeeze(mu, CRHBYTES, &state);

@@ -253,11 +269,51 @@ rej:

/* Write signature */
PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, &z, &h, &c);

*siglen = CRYPTO_BYTES;
*siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;
return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_CLEAN_crypto_sign
*
* Description: Compute signed message: the signature followed by a copy
*              of the message.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes),
*                can be equal to m
*              - size_t *smlen: pointer to output length of signed message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns the return code of
* PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature (0 on success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(
    uint8_t *sm, size_t *smlen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk) {
    int rc;
    uint8_t *msg_copy = sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;

    /* memmove, not memcpy: sm and m are allowed to overlap. */
    memmove(msg_copy, m, mlen);

    /* Sign the relocated copy rather than m: when sm aliases m, the move
     * above has already overwritten part of the caller's message buffer,
     * so reading m here would sign different bytes than the ones stored
     * in the signed message. */
    rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, msg_copy, mlen, sk);
    *smlen += mlen;
    return rc;
}


/*************************************************
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify
*
* Description: Verify a detached signature on a message.
*
* Arguments: - uint8_t *sig: signature
* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES)
* - uint8_t *m: pointer to message
* - size_t mlen: length of message
* - uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk) {
@@ -268,7 +324,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
polyvecl mat[K], z;
polyveck t1, w1, h, tmp1, tmp2;

if (siglen < CRYPTO_BYTES) {
if (siglen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
return -1;
}

@@ -281,7 +337,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
}

/* Compute CRH(CRH(rho, t1), msg) */
crh(mu, pk, CRYPTO_PUBLICKEYBYTES);
crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES);

shake256incctx state;
shake256_inc_init(&state);
@@ -325,40 +381,9 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
// All good
return 0;
}
/*************************************************
* Name: crypto_sign
*
* Description: Compute signed message.
*
* Arguments: - unsigned char *sm: pointer to output signed message (allocated
* array with CRYPTO_BYTES + mlen bytes),
* can be equal to m
* - unsigned long long *smlen: pointer to output length of signed
* message
* - const unsigned char *m: pointer to message to be signed
* - unsigned long long mlen: length of message
* - const unsigned char *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm,
size_t *smlen,
const uint8_t *m,
size_t mlen,
const uint8_t *sk) {
size_t i;
int rc;
for (i = 0; i < mlen; i++) {
sm[CRYPTO_BYTES + i] = m[i];
}
rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk);
*smlen += mlen;
return rc;

}

/*************************************************
* Name: crypto_sign_open
* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open
*
* Description: Verify signed message.
*
@@ -371,24 +396,23 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm,
*
* Returns 0 if signed message could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m,
size_t *mlen,
const uint8_t *sm,
size_t smlen,
const uint8_t *pk) {
int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk) {
size_t i;
if (smlen < CRYPTO_BYTES) {
if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) {
goto badsig;
}
*mlen = smlen - CRYPTO_BYTES;
*mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES;

if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, CRYPTO_BYTES,
sm + CRYPTO_BYTES, *mlen, pk)) {
if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES,
sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) {
goto badsig;
} else {
/* All good, copy msg, return 0 */
for (i = 0; i < *mlen; ++i) {
m[i] = sm[CRYPTO_BYTES + i];
m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i];
}
return 0;
}


+ 5
- 23
crypto_sign/dilithium2/clean/sign.h View File

@@ -1,30 +1,12 @@
#ifndef SIGN_H
#define SIGN_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H
#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H

#include "api.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const unsigned char mu[CRHBYTES],
void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES],
const polyveck *w1);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

#endif

+ 26
- 0
crypto_sign/dilithium2/clean/stream.c View File

@@ -0,0 +1,26 @@
#include "stream.h"

#include <string.h>

/*
 * Start a SHAKE128 stream keyed by seed and nonce: the absorbed input is the
 * SEEDBYTES-byte seed followed by the nonce as two bytes, low byte first.
 */
void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(
    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {

    uint8_t buf[SEEDBYTES + 2];
    uint8_t *tail = buf + SEEDBYTES;

    memcpy(buf, seed, SEEDBYTES);
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake128_absorb(state, buf, sizeof buf);
}


/*
 * Start a SHAKE256 stream keyed by seed and nonce: the absorbed input is the
 * CRHBYTES-byte seed followed by the nonce as two bytes, low byte first.
 */
void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(
    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {

    uint8_t buf[CRHBYTES + 2];
    uint8_t *tail = buf + CRHBYTES;

    memcpy(buf, seed, CRHBYTES);
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake256_absorb(state, buf, sizeof buf);
}

+ 15
- 0
crypto_sign/dilithium2/clean/stream.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_DILITHIUM2_CLEAN_STREAM_H
#define PQCLEAN_DILITHIUM2_CLEAN_STREAM_H

#include <stdint.h>

#include "fips202.h"
#include "params.h"

void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);

void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

#endif

+ 0
- 32
crypto_sign/dilithium2/clean/symmetric.c View File

@@ -1,32 +0,0 @@
#include "symmetric.h"
#include "fips202.h"

void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(shake128ctx *state,
const unsigned char seed[SEEDBYTES],
uint16_t nonce) {
unsigned int i;
unsigned char buf[SEEDBYTES + 2];

for (i = 0; i < SEEDBYTES; ++i) {
buf[i] = seed[i];
}
buf[SEEDBYTES] = (uint8_t) nonce;
buf[SEEDBYTES + 1] = (uint8_t) (nonce >> 8);

shake128_absorb(state, buf, sizeof(buf));
}

void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(shake256ctx *state,
const unsigned char seed[CRHBYTES],
uint16_t nonce) {
unsigned int i;
unsigned char buf[CRHBYTES + 2];

for (i = 0; i < CRHBYTES; ++i) {
buf[i] = seed[i];
}
buf[CRHBYTES] = (uint8_t) nonce;
buf[CRHBYTES + 1] = (uint8_t) (nonce >> 8);

shake256_absorb(state, buf, sizeof(buf));
}

+ 9
- 9
crypto_sign/dilithium2/clean/symmetric.h View File

@@ -1,8 +1,11 @@
#ifndef SYMMETRIC_H
#define SYMMETRIC_H
#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H
#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H

#include "fips202.h"
#include "params.h"
#include "stream.h"


#include "fips202.h"

#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(STATE, SEED, NONCE)
@@ -13,11 +16,8 @@
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(shake128ctx *state,
const unsigned char *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(shake256ctx *state,
const unsigned char *seed,
uint16_t nonce);
typedef shake128ctx stream128_state;
typedef shake256ctx stream256_state;


#endif

+ 10
- 1
crypto_sign/dilithium3/META.yml View File

@@ -17,4 +17,13 @@ auxiliary-submitters:
- Damien Stehlé
implementations:
- name: clean
version: https://github.com/pq-crystals/dilithium/commit/40f79645879b5c69835cd91d06945d7c24f39922
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
- name: avx2
version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08
supported_platforms:
- architecture: x86_64
operating_systems:
- Linux
required_flags:
- avx2
- bmi2

+ 6
- 0
crypto_sign/dilithium3/avx2/LICENSE View File

@@ -0,0 +1,6 @@
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.

+ 43
- 0
crypto_sign/dilithium3/avx2/Makefile View File

@@ -0,0 +1,43 @@
# This Makefile can be used with GNU Make or BSD Make

LIB=libdilithium3_avx2.a

SOURCES = fips202x4.c invntt.s nttconsts.c ntt.s packing.c pointwise.S poly.c \
polyvec.c reduce.s rejsample.c rounding.c sign.c stream.c
OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \
polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o
HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \
fips202x4.h shuffle.inc

CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \
-Wmissing-prototypes -Wredundant-decls -std=c99 \
-Wcast-align \
-mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS)

all: $(LIB)

KECCAK4XDIR=../../../common/keccak4x
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o
KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ)

%.o: %.c $(HEADERS)
$(CC) $(CFLAGS) -c -o $@ $<

%.o: %.s $(HEADERS)
$(AS) -o $@ $<

%.o: %.S $(HEADERS)
$(AS) -c -o $@ $<

$(LIB): $(OBJECTS) $(KECCAK4X)
$(AR) -r $@ $^

$(KECCAK4X):
$(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ)

clean:
$(RM) $(OBJECTS)
$(RM) $(LIB)
$(MAKE) -C $(KECCAK4XDIR) clean


+ 22
- 0
crypto_sign/dilithium3/avx2/alignment.h View File

@@ -0,0 +1,22 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H
#define PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H

/*
 * Anonymous union types that overlay a plain element array (as_arr) with an
 * array of __m256i (as_vec); the union therefore takes the alignment of
 * __m256i. This header includes nothing itself, so uint32_t, uint64_t and
 * __m256i must already be declared where the macros are expanded.
 */

/* NOTE(review): despite the name, as_arr is declared uint32_t here, while
 * as_vec is sized as if the elements were bytes ((N)/32 vectors) - confirm
 * against the users of this macro whether uint8_t was intended. */
#define ALIGNED_UINT8(N) \
union { \
uint32_t as_arr[N]; \
__m256i as_vec[(N)/32]; \
}

/* N uint32_t elements, eight per __m256i. */
#define ALIGNED_UINT32(N) \
union { \
uint32_t as_arr[N]; \
__m256i as_vec[(N)/8]; \
}

/* NOTE(review): one __m256i holds four uint64_t, so as_vec with (N)/8
 * entries spans only half of as_arr - confirm whether (N)/4 was intended. */
#define ALIGNED_UINT64(N) \
union { \
uint64_t as_arr[N]; \
__m256i as_vec[(N)/8]; \
}

#endif //PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H

+ 37
- 0
crypto_sign/dilithium3/avx2/api.h View File

@@ -0,0 +1,37 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_API_H
#define PQCLEAN_DILITHIUM3_AVX2_API_H

#include <stddef.h>
#include <stdint.h>

#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1472U
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 3504U
#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 2701U

#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3"


int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
const uint8_t *m, size_t mlen, const uint8_t *sk);

int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);



#endif

+ 239
- 0
crypto_sign/dilithium3/avx2/fips202x4.c View File

@@ -0,0 +1,239 @@
#include <immintrin.h>
#include <stdint.h>

#include "fips202.h"
#include "fips202x4.h"
#include "params.h"

#define NROUNDS 24
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))

/* Read the eight bytes at x as a little-endian 64-bit word
 * (x[0] is the least significant byte). */
static uint64_t load64(const uint8_t *x) {
    uint64_t word = 0;
    int k;

    /* Fold from the top byte down: shift what we have, append the next byte. */
    for (k = 7; k >= 0; --k) {
        word = (word << 8) | x[k];
    }

    return word;
}

/* Write u to the eight bytes at x in little-endian order
 * (least significant byte goes to x[0]). */
static void store64(uint8_t *x, uint64_t u) {
    uint8_t *const end = x + 8;

    while (x != end) {
        *x++ = (uint8_t)u;
        u >>= 8;
    }
}

/* Use implementation from the Keccak Code Package */
extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s);
#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds

/*
 * Absorb four messages of identical length into a 4-way interleaved Keccak
 * state and apply the final padding.
 *
 * s:       25 vectors; viewed as uint64_t, word 4*i + j is lane i of
 *          instance j. The previous contents are ignored: the state is
 *          zeroed first.
 * r:       rate in bytes. The code processes r / 8 lanes per block and pads
 *          inside 200-byte scratch buffers, so r is expected to be a
 *          multiple of 8 and at most 200.
 * m0..m3:  the four inputs, each mlen bytes long.
 * p:       suffix byte written right after the last message byte; 0x80 is
 *          then ORed into byte r - 1 of the final block.
 *
 * The last (partial) block is XORed into the state without a following
 * permutation; the squeeze routine permutes before it outputs.
 */
static void keccak_absorb4x(__m256i *s,
                            unsigned int r,
                            const uint8_t *m0,
                            const uint8_t *m1,
                            const uint8_t *m2,
                            const uint8_t *m3,
                            unsigned long long mlen,
                            uint8_t p) {
    unsigned long long i;
    uint8_t t0[200];
    uint8_t t1[200];
    uint8_t t2[200];
    uint8_t t3[200];
    uint64_t *ss = (uint64_t *)s;

    /* The one-shot callers pass an uninitialized state array, so store
     * zeros directly instead of XORing the indeterminate contents with
     * themselves. */
    for (i = 0; i < 25; ++i) {
        s[i] = _mm256_setzero_si256();
    }

    /* Full blocks: XOR r bytes of every message into its instance, permute. */
    while (mlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            ss[4 * i + 0] ^= load64(m0 + 8 * i);
            ss[4 * i + 1] ^= load64(m1 + 8 * i);
            ss[4 * i + 2] ^= load64(m2 + 8 * i);
            ss[4 * i + 3] ^= load64(m3 + 8 * i);
        }

        KeccakF1600_StatePermute4x(s);
        mlen -= r;
        m0 += r;
        m1 += r;
        m2 += r;
        m3 += r;
    }

    /* Final block: remaining mlen (< r) bytes, zero-filled up to r. */
    for (i = 0; i < r; ++i) {
        t0[i] = 0;
        t1[i] = 0;
        t2[i] = 0;
        t3[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t0[i] = m0[i];
        t1[i] = m1[i];
        t2[i] = m2[i];
        t3[i] = m3[i];
    }

    /* i == mlen here: the suffix byte follows the message directly. */
    t0[i] = p;
    t1[i] = p;
    t2[i] = p;
    t3[i] = p;

    t0[r - 1] |= 128;
    t1[r - 1] |= 128;
    t2[r - 1] |= 128;
    t3[r - 1] |= 128;

    for (i = 0; i < r / 8; ++i) {
        ss[4 * i + 0] ^= load64(t0 + 8 * i);
        ss[4 * i + 1] ^= load64(t1 + 8 * i);
        ss[4 * i + 2] ^= load64(t2 + 8 * i);
        ss[4 * i + 3] ^= load64(t3 + 8 * i);
    }
}


/*
 * Squeeze nblocks blocks of r bytes from each of the four interleaved Keccak
 * instances in s into h0..h3. The state is permuted before every block is
 * written, and is left updated so that squeezing can continue later.
 */
static void keccak_squeezeblocks4x(uint8_t *h0,
                                   uint8_t *h1,
                                   uint8_t *h2,
                                   uint8_t *h3,
                                   unsigned long nblocks,
                                   unsigned int r,
                                   __m256i *s) {
    const uint64_t *lanes = (const uint64_t *)s;
    const unsigned int words = r / 8;
    unsigned long blk;
    unsigned int w;

    for (blk = 0; blk < nblocks; ++blk) {
        KeccakF1600_StatePermute4x(s);

        /* Word 4*w + j of the state is lane w of instance j. */
        for (w = 0; w < words; ++w) {
            store64(h0 + 8 * w, lanes[4 * w + 0]);
            store64(h1 + 8 * w, lanes[4 * w + 1]);
            store64(h2 + 8 * w, lanes[4 * w + 2]);
            store64(h3 + 8 * w, lanes[4 * w + 3]);
        }

        h0 += r;
        h1 += r;
        h2 += r;
        h3 += r;
    }
}

/*
 * 4-way SHAKE128 absorb. Forwards to keccak_absorb4x with rate SHAKE128_RATE
 * and suffix byte 0x1F.
 *
 * s:       output state (25 vectors); overwritten.
 * m0..m3:  the four inputs, each mlen bytes long.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen) {
keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*
 * 4-way SHAKE128 squeeze. Forwards to keccak_squeezeblocks4x with rate
 * SHAKE128_RATE, writing nblocks * SHAKE128_RATE bytes to each of h0..h3.
 *
 * s: state produced by the matching absorb; updated in place.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s);
}

/*
 * 4-way SHAKE256 absorb. Forwards to keccak_absorb4x with rate SHAKE256_RATE
 * and suffix byte 0x1F.
 *
 * s:       output state (25 vectors); overwritten.
 * m0..m3:  the four inputs, each mlen bytes long.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen) {
keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*
 * 4-way SHAKE256 squeeze. Forwards to keccak_squeezeblocks4x with rate
 * SHAKE256_RATE, writing nblocks * SHAKE256_RATE bytes to each of h0..h3.
 *
 * s: state produced by the matching absorb; updated in place.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s) {
keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s);
}

/*
 * One-shot 4-way SHAKE128: hash four mlen-byte inputs m0..m3 and write hlen
 * output bytes to each of h0..h3.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128_4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long long hlen,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    __m256i s[25];
    uint8_t t[4][SHAKE128_RATE];
    const unsigned long nblocks = hlen / SHAKE128_RATE;
    const unsigned long whole = nblocks * SHAKE128_RATE;
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen);

    /* Whole blocks go straight into the caller's buffers. */
    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
    h0 += whole;
    h1 += whole;
    h2 += whole;
    h3 += whole;
    hlen -= whole;

    if (hlen == 0) {
        return;
    }

    /* Trailing partial block: squeeze one more block into scratch space and
     * copy out only the bytes that were asked for. */
    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
    for (k = 0; k < hlen; ++k) {
        h0[k] = t[0][k];
        h1[k] = t[1][k];
        h2[k] = t[2][k];
        h3[k] = t[3][k];
    }
}

/*
 * One-shot 4-way SHAKE256: hash four mlen-byte inputs m0..m3 and write hlen
 * output bytes to each of h0..h3.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256_4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long long hlen,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    __m256i s[25];
    uint8_t t[4][SHAKE256_RATE];
    const unsigned long nblocks = hlen / SHAKE256_RATE;
    const unsigned long whole = nblocks * SHAKE256_RATE;
    unsigned int k;

    PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen);

    /* Whole blocks go straight into the caller's buffers. */
    PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s);
    h0 += whole;
    h1 += whole;
    h2 += whole;
    h3 += whole;
    hlen -= whole;

    if (hlen == 0) {
        return;
    }

    /* Trailing partial block: squeeze one more block into scratch space and
     * copy out only the bytes that were asked for. */
    PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s);
    for (k = 0; k < hlen; ++k) {
        h0[k] = t[0][k];
        h1[k] = t[1][k];
        h2[k] = t[2][k];
        h3[k] = t[3][k];
    }
}

+ 65
- 0
crypto_sign/dilithium3/avx2/fips202x4.h View File

@@ -0,0 +1,65 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H
#define PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H

#include <immintrin.h>
#include <stdint.h>

#include "params.h"

void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s);

void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(
__m256i *s,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long nblocks,
__m256i *s);

void PQCLEAN_DILITHIUM3_AVX2_shake128_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long long hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

void PQCLEAN_DILITHIUM3_AVX2_shake256_4x(
uint8_t *h0,
uint8_t *h1,
uint8_t *h2,
uint8_t *h3,
unsigned long long hlen,
const uint8_t *m0,
const uint8_t *m1,
const uint8_t *m2,
const uint8_t *m3,
unsigned long long mlen);

#endif

+ 281
- 0
crypto_sign/dilithium3/avx2/invntt.s View File

@@ -0,0 +1,281 @@
.include "shuffle.inc"

# Inverse-NTT butterfly applied to four (l, h) register pairs at once.
# Per pair, with P = (l + ymm2 - h) * zeta taken as a 64-bit product per lane:
#   l becomes l + h
#   h becomes (P + ((P * ymm0) mod 2^32) * ymm1) >> 32
# z0 is the zeta register for pairs 0 and 1, z1 for pairs 2 and 3.
# ymm0, ymm1 and ymm2 must be set up by the caller; both routines in this
# file load 8xqinv, 8xq and 8x256q into them. ymm12-ymm15 are clobbered.
# NOTE(review): adding ymm2 before the subtraction presumably keeps l - h
# from wrapping below zero - confirm against the coefficient bounds.
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3
vpaddd %ymm2,%ymm\l0,%ymm12
vpaddd %ymm2,%ymm\l1,%ymm13
vpaddd %ymm2,%ymm\l2,%ymm14

vpsubd %ymm\h0,%ymm12,%ymm12
vpsubd %ymm\h1,%ymm13,%ymm13
vpsubd %ymm\h2,%ymm14,%ymm14

# Both z0 multiplications come before ymm15 is written as a temporary, so
# the default z0=15 is consumed before it is overwritten.
vpmuludq %ymm\z0,%ymm12,%ymm12
vpmuludq %ymm\z0,%ymm13,%ymm13
vpaddd %ymm2,%ymm\l3,%ymm15

vpmuludq %ymm\z1,%ymm14,%ymm14
vpsubd %ymm\h3,%ymm15,%ymm15
vpaddd %ymm\l0,%ymm\h0,%ymm\l0

vpmuludq %ymm\z1,%ymm15,%ymm15
vpaddd %ymm\l1,%ymm\h1,%ymm\l1
vpaddd %ymm\l2,%ymm\h2,%ymm\l2

vpaddd %ymm\l3,%ymm\h3,%ymm\l3

# Reduction of the four 64-bit products held in ymm12-ymm15 into the h registers.
vpmuludq %ymm0,%ymm12,%ymm\h0
vpmuludq %ymm0,%ymm13,%ymm\h1
vpmuludq %ymm0,%ymm14,%ymm\h2
vpmuludq %ymm0,%ymm15,%ymm\h3
vpmuludq %ymm1,%ymm\h0,%ymm\h0
vpmuludq %ymm1,%ymm\h1,%ymm\h1
vpmuludq %ymm1,%ymm\h2,%ymm\h2
vpmuludq %ymm1,%ymm\h3,%ymm\h3
vpaddq %ymm12,%ymm\h0,%ymm\h0
vpaddq %ymm13,%ymm\h1,%ymm\h1
vpaddq %ymm14,%ymm\h2,%ymm\h2
vpaddq %ymm15,%ymm\h3,%ymm\h3
vpsrlq $32,%ymm\h0,%ymm\h0
vpsrlq $32,%ymm\h1,%ymm\h1
vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3
.endm

# Inverse NTT, levels 0 to 4, on one group of 32 packed 32-bit coefficients.
# Declared in ntt.h as invntt_levels0t4_avx(tmp, a, zetas_inv).
# Reads 128 bytes at (%rsi) and zeta constants at (%rdx) offsets 0 to 123;
# writes 256 bytes at (%rdi), one coefficient per 64-bit lane.
# All vector loads and stores are vmovdqa, so the buffers at rsi and rdi must
# be 32-byte aligned.
# NOTE(review): rdi/rsi/rdx = tmp/a/zetas_inv assumes the System V AMD64
# calling convention - confirm for other targets.
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx
PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm6
vmovdqa 32(%rsi),%ymm7
vmovdqa 64(%rsi),%ymm5
vmovdqa 96(%rsi),%ymm10

#reorder
shuffle8 6,5,8,5
shuffle8 7,10,6,10

shuffle4 8,6,4,6
shuffle4 5,10,8,10

# Copy the upper 32 bits of every 64-bit lane of ymm4/6/8/10 into the lower
# 32 bits of ymm5/7/9/11.
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11

# Level 0 is the butterfly computation written out by hand on the pairs
# (4,5) (6,7) (8,9) (10,11), with a separate zeta vector per pair
# (ymm3, ymm15, ymm5, ymm7). ymm5 and ymm7 are reloaded with zetas only
# after their old values have been folded into ymm12/ymm4 and ymm13/ymm6.
level0:
vpmovzxdq (%rdx),%ymm3
vpmovzxdq 16(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpmovzxdq 32(%rdx),%ymm5
vpmovzxdq 48(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level1:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# Default macro zeta registers: ymm15 for the first two pairs, ymm3 for the last two.
vpmovzxdq 64(%rdx),%ymm15
vpmovzxdq 80(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level2:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpmovzxdq 96(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11 3,3

#shuffle
shuffle4 4,5,3,5
shuffle4 6,7,4,7
shuffle4 8,9,6,9
shuffle4 10,11,8,11

level3:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# ymm10 = dword at 112(%rdx) in its low 128 bits, dword at 116(%rdx) in its high 128 bits.
vpbroadcastd 112(%rdx),%ymm14
vpbroadcastd 116(%rdx),%ymm15
vpblendd $0xF0,%ymm15,%ymm14,%ymm10

butterfly 3,4,6,8,5,7,9,11 10,10

#shuffle
shuffle8 3,4,10,4
shuffle8 6,8,3,8
shuffle8 5,7,6,7
shuffle8 9,11,5,11

level4:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpbroadcastd 120(%rdx),%ymm9

butterfly 10,3,6,5,4,8,7,11 9,9

#store
vmovdqa %ymm10,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm6,64(%rdi)
vmovdqa %ymm5,96(%rdi)
vmovdqa %ymm4,128(%rdi)
vmovdqa %ymm8,160(%rdi)
vmovdqa %ymm7,192(%rdi)
vmovdqa %ymm11,224(%rdi)

ret

# Inverse NTT, levels 5 to 7, followed by repacking to 32-bit coefficients.
# Declared in ntt.h as invntt_levels5t7_avx(a, tmp, zetas_inv).
# Reads eight 32-byte vectors at (%rsi) with a stride of 256 bytes and zeta
# constants at (%rdx) offsets 0 to 27; writes eight 16-byte vectors at (%rdi)
# with a stride of 128 bytes.
# Loads and stores are vmovdqa, so rsi must be 32-byte aligned and rdi
# 16-byte aligned.
# NOTE(review): rdi/rsi/rdx = a/tmp/zetas_inv assumes the System V AMD64
# calling convention - confirm for other targets.
.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx
PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 256(%rsi),%ymm5
vmovdqa 512(%rsi),%ymm6
vmovdqa 768(%rsi),%ymm7
vmovdqa 1024(%rsi),%ymm8
vmovdqa 1280(%rsi),%ymm9
vmovdqa 1536(%rsi),%ymm10
vmovdqa 1792(%rsi),%ymm11

# Level 5 is the butterfly computation written out by hand on the pairs
# (4,5) (6,7) (8,9) (10,11), with a separate broadcast zeta per pair
# (ymm3, ymm15, ymm5, ymm7).
level5:
vpbroadcastd (%rdx),%ymm3
vpbroadcastd 4(%rdx),%ymm15
vpaddd %ymm2,%ymm4,%ymm12
vpaddd %ymm2,%ymm6,%ymm13
vpaddd %ymm2,%ymm8,%ymm14

vpsubd %ymm5,%ymm12,%ymm12
vpsubd %ymm7,%ymm13,%ymm13
vpsubd %ymm9,%ymm14,%ymm14

vpmuludq %ymm3,%ymm12,%ymm12
vpmuludq %ymm15,%ymm13,%ymm13
vpaddd %ymm2,%ymm10,%ymm15

vpsubd %ymm11,%ymm15,%ymm15
vpaddd %ymm4,%ymm5,%ymm4
vpaddd %ymm6,%ymm7,%ymm6
vpbroadcastd 8(%rdx),%ymm5
vpbroadcastd 12(%rdx),%ymm7

vpmuludq %ymm5,%ymm14,%ymm14
vpmuludq %ymm7,%ymm15,%ymm15
vpaddd %ymm8,%ymm9,%ymm8

vpaddd %ymm10,%ymm11,%ymm10

vpmuludq %ymm0,%ymm12,%ymm5
vpmuludq %ymm0,%ymm13,%ymm7
vpmuludq %ymm0,%ymm14,%ymm9
vpmuludq %ymm0,%ymm15,%ymm11
vpmuludq %ymm1,%ymm5,%ymm5
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm9,%ymm9
vpmuludq %ymm1,%ymm11,%ymm11
vpaddq %ymm12,%ymm5,%ymm5
vpaddq %ymm13,%ymm7,%ymm7
vpaddq %ymm14,%ymm9,%ymm9
vpaddq %ymm15,%ymm11,%ymm11
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm7,%ymm7
vpsrlq $32,%ymm9,%ymm9
vpsrlq $32,%ymm11,%ymm11

level6:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# Default macro zeta registers: ymm15 for the first two pairs, ymm3 for the last two.
vpbroadcastd 16(%rdx),%ymm15
vpbroadcastd 20(%rdx),%ymm3

butterfly 4,5,8,9,6,7,10,11

level7:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpbroadcastd 24(%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11 3,3

#consts
# Multiply ymm4-ymm7 by the 8xdiv constant and reduce as in the butterfly;
# ymm8-ymm11 are not scaled here.
# NOTE(review): presumably the level-7 zeta already carries the same factor
# for ymm8-ymm11 - confirm against the zetas_inv table.
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3

vpmuludq %ymm3,%ymm4,%ymm4
vpmuludq %ymm3,%ymm5,%ymm5
vpmuludq %ymm3,%ymm6,%ymm6
vpmuludq %ymm3,%ymm7,%ymm7
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm12,%ymm4,%ymm4
vpaddq %ymm13,%ymm5,%ymm5
vpaddq %ymm14,%ymm6,%ymm6
vpaddq %ymm15,%ymm7,%ymm7
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm5,%ymm5
vpsrlq $32,%ymm6,%ymm6
vpsrlq $32,%ymm7,%ymm7

#store
# The permutation mask selects dwords 0, 2, 4, 6, i.e. the low half of each
# 64-bit lane, into the low 128 bits; only that xmm half is stored.
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3
vpermd %ymm4,%ymm3,%ymm4
vpermd %ymm5,%ymm3,%ymm5
vpermd %ymm6,%ymm3,%ymm6
vpermd %ymm7,%ymm3,%ymm7
vpermd %ymm8,%ymm3,%ymm8
vpermd %ymm9,%ymm3,%ymm9
vpermd %ymm10,%ymm3,%ymm10
vpermd %ymm11,%ymm3,%ymm11
vmovdqa %xmm4,(%rdi)
vmovdqa %xmm5,128(%rdi)
vmovdqa %xmm6,256(%rdi)
vmovdqa %xmm7,384(%rdi)
vmovdqa %xmm8,512(%rdi)
vmovdqa %xmm9,640(%rdi)
vmovdqa %xmm10,768(%rdi)
vmovdqa %xmm11,896(%rdi)

ret

+ 26
- 0
crypto_sign/dilithium3/avx2/ntt.h View File

@@ -0,0 +1,26 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_NTT_H
#define PQCLEAN_DILITHIUM3_AVX2_NTT_H

/*
 * Prototypes for the assembly NTT routines of the Dilithium3 AVX2
 * implementation. The include guard carries the implementation namespace,
 * like the other headers of this implementation, so that this header cannot
 * shadow another implementation's ntt.h within one translation unit.
 */

#include <stdint.h>

#include "nttconsts.h"
#include "params.h"

/* Forward NTT, first stage (ntt.s): reads 32-bit coefficients from a and
 * writes them widened to 64-bit lanes into tmp. */
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(uint64_t *tmp,
        const uint32_t *a,
        const uint32_t *zetas);
/* Forward NTT, second stage (ntt.s): reads the 64-bit lanes in tmp and
 * writes packed 32-bit coefficients back to a. */
void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(uint32_t *a,
        const uint64_t *tmp,
        const uint32_t *zetas);

/* Inverse NTT, first stage (invntt.s): a -> tmp, one coefficient per
 * 64-bit lane. */
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(uint64_t *tmp,
        const uint32_t *a,
        const uint32_t *zetas_inv);
/* Inverse NTT, second stage (invntt.s): tmp -> a, packed 32-bit
 * coefficients. */
void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(uint32_t *a,
        const uint64_t *tmp,
        const uint32_t *zetas_inv);

/* Coefficient-wise products of a and b into c; the _acc variant is
 * presumably an accumulating form - see the assembly source to confirm. */
void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b);
void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b);

#endif

+ 178
- 0
crypto_sign/dilithium3/avx2/ntt.s View File

@@ -0,0 +1,178 @@
.include "shuffle.inc"

# Forward-NTT butterfly applied to four (rl, rh) register pairs at once.
# Per pair, with P = rh * zeta taken as a 64-bit product per lane and
# t = (P + ((P * ymm0) mod 2^32) * ymm1) >> 32:
#   rh becomes rl + ymm2 - t
#   rl becomes rl + t
# z0..z3 name the zeta register of each pair (all default to ymm3).
# ymm0, ymm1 and ymm2 must be set up by the caller; both routines in this
# file load 8xqinv, 8xq and 8x2q into them. ymm12-ymm15 are clobbered.
# Callers may pass ymm12-ymm15 as zeta registers: all four zeta
# multiplications happen before the reduction overwrites those registers.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3
#mul
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3

#reduce
vpmuludq %ymm0,%ymm\rh0,%ymm12
vpmuludq %ymm0,%ymm\rh1,%ymm13
vpmuludq %ymm0,%ymm\rh2,%ymm14
vpmuludq %ymm0,%ymm\rh3,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm\rh0,%ymm12,%ymm12
vpaddq %ymm\rh1,%ymm13,%ymm13
vpaddq %ymm\rh2,%ymm14,%ymm14
vpaddq %ymm\rh3,%ymm15,%ymm15
vpsrlq $32,%ymm12,%ymm12
vpsrlq $32,%ymm13,%ymm13
vpsrlq $32,%ymm14,%ymm14
vpsrlq $32,%ymm15,%ymm15

#update
# rh is rebuilt from rl + ymm2 before rl itself is updated.
vpaddd %ymm2,%ymm\rl0,%ymm\rh0
vpaddd %ymm2,%ymm\rl1,%ymm\rh1
vpaddd %ymm2,%ymm\rl2,%ymm\rh2
vpaddd %ymm2,%ymm\rl3,%ymm\rh3
vpaddd %ymm12,%ymm\rl0,%ymm\rl0
vpaddd %ymm13,%ymm\rl1,%ymm\rl1
vpaddd %ymm14,%ymm\rl2,%ymm\rl2
vpaddd %ymm15,%ymm\rl3,%ymm\rl3
vpsubd %ymm12,%ymm\rh0,%ymm\rh0
vpsubd %ymm13,%ymm\rh1,%ymm\rh1
vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm

# Forward NTT, levels 0 to 2.
# Declared in ntt.h as ntt_levels0t2_avx(tmp, a, zetas).
# Reads eight groups of four 32-bit coefficients at (%rsi) with a stride of
# 128 bytes, zero-extending each to a 64-bit lane, and zeta constants at
# (%rdx) offsets 0 to 27; writes eight 32-byte vectors at (%rdi) with a
# stride of 256 bytes (vmovdqa, so rdi must be 32-byte aligned).
# NOTE(review): rdi/rsi/rdx = tmp/a/zetas assumes the System V AMD64
# calling convention - confirm for other targets.
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx
PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2

level0:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpbroadcastd (%rdx),%ymm3

#load
vpmovzxdq (%rsi),%ymm4
vpmovzxdq 128(%rsi),%ymm5
vpmovzxdq 256(%rsi),%ymm6
vpmovzxdq 384(%rsi),%ymm7
vpmovzxdq 512(%rsi),%ymm8
vpmovzxdq 640(%rsi),%ymm9
vpmovzxdq 768(%rsi),%ymm10
vpmovzxdq 896(%rsi),%ymm11

# One zeta (ymm3, the macro default) for all four pairs.
butterfly 4,5,6,7,8,9,10,11

level1:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# Two zetas: ymm12 for the first two pairs, ymm13 for the last two.
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13

butterfly 4,5,8,9,6,7,10,11 12,12,13,13

level2:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# Four zetas, one per pair.
vpbroadcastd 12(%rdx),%ymm12
vpbroadcastd 16(%rdx),%ymm13
vpbroadcastd 20(%rdx),%ymm14
vpbroadcastd 24(%rdx),%ymm15

butterfly 4,6,8,10,5,7,9,11 12,13,14,15

#store
vmovdqa %ymm4,(%rdi)
vmovdqa %ymm5,256(%rdi)
vmovdqa %ymm6,512(%rdi)
vmovdqa %ymm7,768(%rdi)
vmovdqa %ymm8,1024(%rdi)
vmovdqa %ymm9,1280(%rdi)
vmovdqa %ymm10,1536(%rdi)
vmovdqa %ymm11,1792(%rdi)

ret

# Forward NTT, remaining levels (the labels below cover level3 to level7),
# followed by repacking to 32-bit coefficients.
# Declared in ntt.h as ntt_levels3t8_avx(a, tmp, zetas).
# Reads 256 contiguous bytes at (%rsi) and zeta constants at (%rdx) offsets
# 0 to 123; writes 128 contiguous bytes at (%rdi). Vector loads and stores
# on rsi and rdi are vmovdqa, so both must be 32-byte aligned.
# NOTE(review): rdi/rsi/rdx = a/tmp/zetas assumes the System V AMD64
# calling convention - confirm for other targets.
.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx
PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2

#load
vmovdqa (%rsi),%ymm4
vmovdqa 32(%rsi),%ymm5
vmovdqa 64(%rsi),%ymm6
vmovdqa 96(%rsi),%ymm7
vmovdqa 128(%rsi),%ymm8
vmovdqa 160(%rsi),%ymm9
vmovdqa 192(%rsi),%ymm10
vmovdqa 224(%rsi),%ymm11

level3:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpbroadcastd (%rdx),%ymm3

butterfly 4,5,6,7,8,9,10,11

level4:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# ymm12 = dword at 4(%rdx) in its low 128 bits, dword at 8(%rdx) in its high 128 bits.
vpbroadcastd 4(%rdx),%ymm12
vpbroadcastd 8(%rdx),%ymm13
vpblendd $0xF0,%ymm13,%ymm12,%ymm12

shuffle8 4,8,3,8
shuffle8 5,9,4,9
shuffle8 6,10,5,10
shuffle8 7,11,6,11

butterfly 3,8,4,9,5,10,6,11 12,12,12,12

level5:
#PQCLEAN_DILITHIUM3_AVX2_zetas
# From here on each vpmovzxdq loads four consecutive zetas, one per 64-bit lane.
vpmovzxdq 12(%rdx),%ymm12

shuffle4 3,5,7,5
shuffle4 8,10,3,10
shuffle4 4,6,8,6
shuffle4 9,11,4,11

butterfly 7,5,3,10,8,6,4,11 12,12,12,12

level6:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpmovzxdq 28(%rdx),%ymm12
vpmovzxdq 44(%rdx),%ymm13

butterfly 7,5,8,6,3,10,4,11 12,12,13,13

level7:
#PQCLEAN_DILITHIUM3_AVX2_zetas
vpmovzxdq 60(%rdx),%ymm12
vpmovzxdq 76(%rdx),%ymm13
vpmovzxdq 92(%rdx),%ymm14
vpmovzxdq 108(%rdx),%ymm15

butterfly 7,3,8,4,5,10,6,11 12,13,14,15

#store
# Pack two registers into one: the low dwords of ymm5/10/6/11 are shifted
# into the upper half of each 64-bit lane and blended (mask 0xAA = odd
# dwords) over ymm7/3/8/4.
vpsllq $32,%ymm5,%ymm5
vpsllq $32,%ymm10,%ymm10
vpsllq $32,%ymm6,%ymm6
vpsllq $32,%ymm11,%ymm11
vpblendd $0xAA,%ymm5,%ymm7,%ymm7
vpblendd $0xAA,%ymm10,%ymm3,%ymm3
vpblendd $0xAA,%ymm6,%ymm8,%ymm8
vpblendd $0xAA,%ymm11,%ymm4,%ymm4

shuffle4 7,3,5,3
shuffle4 8,4,7,4

shuffle8 5,7,6,7
shuffle8 3,4,5,4

vmovdqa %ymm6,(%rdi)
vmovdqa %ymm5,32(%rdi)
vmovdqa %ymm7,64(%rdi)
vmovdqa %ymm4,96(%rdi)

ret

+ 80
- 0
crypto_sign/dilithium3/avx2/nttconsts.c View File

@@ -0,0 +1,80 @@
#include "nttconsts.h"

/* File-local helper constants; they are #undef'd again right after the
 * vector constants below. Q comes from outside this file. */
#define QINV 4236238847 // -q^(-1) mod 2^32
/* NOTE(review): presumably 2^32 mod Q (the Montgomery factor) - confirm
 * against the value of Q in params.h. */
#define MONT 4193792ULL
/* NOTE(review): if 256 divides Q-1, (Q-1) * ((Q-1) >> 8) is 256^(-1) mod Q,
 * which would make DIV equal to MONT^2 / 256 mod Q - confirm. */
#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q)


/* Broadcast constants read by the assembly routines with vmovdqa, eight
 * 32-bit lanes each. */
/* Loaded into ymm0 by the NTT and inverse-NTT routines. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}};
/* Loaded into ymm1 by the NTT and inverse-NTT routines. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}};
/* Loaded into ymm2 by the forward NTT routines. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}};
/* Loaded into ymm2 by the inverse NTT routines. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q,
256 * Q
}
};
/* vpermd index vector used by invntt_levels5t7_avx: picks dwords 0, 2, 4, 6
 * into the low four lanes before the 128-bit stores. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}};
/* Eight copies of 0x7FFFFF (the low 23 bits set). */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF,
0x7FFFFF, 0x7FFFFF
}
};
/* Final scaling factor multiplied in at the end of invntt_levels5t7_avx. */
const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}};

#undef QINV
#undef MONT
#undef DIV


const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas = {
.as_arr = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776,
3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667,
5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191,
3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439,
7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422,
1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579,
8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390,
7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620,
5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868,
3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076,
6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435,
5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599,
3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165,
7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031,
6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064,
2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112,
162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237,
8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977,
1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735,
6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892,
5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443,
7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090,
4834730, 7018208, 1976782
}
};

const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv = {
.as_arr = {
6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985,
4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748,
2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646,
1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087,
177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422,
6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573,
5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061,
6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386,
1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252,
1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818,
1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496,
7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525,
6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443,
7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093,
5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647,
4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669,
43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330,
1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900,
6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326,
4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745,
3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293,
518909, 2608894, 3975713
}
};

+ 27
- 0
crypto_sign/dilithium3/avx2/nttconsts.h View File

@@ -0,0 +1,27 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H
#define PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#include "params.h"

/* Eight 32-bit words, built with the ALIGNED_UINT32 macro from alignment.h.
 * The sources access the result through its .as_arr member (initializers in
 * nttconsts.c) and its .as_vec member (_mm256_load_si256 in poly.c). */
typedef ALIGNED_UINT32(8) aligned_uint32x8_t;

/* Same wrapper holding N 32-bit words; used for the NTT twiddle tables. */
typedef ALIGNED_UINT32(N) aligned_uint32xN_t;


/* Per-lane constants defined in nttconsts.c. The leading-underscore names are
 * referenced by symbol from the assembly sources (see pointwise.S), so they
 * must not be renamed on the C side alone. */
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones;
extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv;

/* Twiddle-factor tables passed to the forward and inverse NTT kernels by
 * poly_ntt and poly_invntt_montgomery. */
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas;
extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv;

#endif //PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H


+ 305
- 0
crypto_sign/dilithium3/avx2/packing.c View File

@@ -0,0 +1,305 @@
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_pk
*
* Description: Serialize the public key pk = (rho, t1). The output is
*              SEEDBYTES bytes of rho followed by K bit-packed t1
*              polynomials of POLT1_SIZE_PACKED bytes each.
*
* Arguments:   - uint8_t pk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(
    uint8_t *pk,
    const uint8_t *rho,
    const polyveck *t1) {
    uint8_t *out = pk;
    unsigned int idx;

    /* rho is stored verbatim at the front. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }

    /* One packed t1 polynomial per row, back to back. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(out, &t1->vec[idx]);
        out += POLT1_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_pk
*
* Description: Unpack public key pk = (rho, t1). Reads SEEDBYTES bytes of
*              rho followed by K packed t1 polynomials.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - polyveck *t1: pointer to output vector t1
*              - const uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(
    uint8_t *rho,
    polyveck *t1,
    const uint8_t *pk) {
    const uint8_t *in = pk;
    unsigned int idx;

    /* Leading SEEDBYTES bytes are rho. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }

    /* The remainder is K packed t1 polynomials. */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[idx], in);
        in += POLT1_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_sk
*
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
*              Fields are written back to back in exactly that order:
*              SEEDBYTES of rho, SEEDBYTES of key, CRHBYTES of tr,
*              L packed s1 polynomials, K packed s2 polynomials and
*              K packed t0 polynomials.
*
* Arguments:   - uint8_t sk[]: output byte array
*              - const uint8_t rho[]: byte array containing rho
*              - const uint8_t key[]: byte array containing key
*              - const uint8_t tr[]: byte array containing tr
*              - const polyvecl *s1: pointer to vector s1
*              - const polyveck *s2: pointer to vector s2
*              - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(
    uint8_t *sk,
    const uint8_t *rho,
    const uint8_t *key,
    const uint8_t *tr,
    const polyvecl *s1,
    const polyveck *s2,
    const polyveck *t0) {
    uint8_t *out = sk;
    unsigned int idx;

    /* Raw byte fields: rho, key, tr. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = rho[idx];
    }
    for (idx = 0; idx < SEEDBYTES; idx++) {
        *out++ = key[idx];
    }
    for (idx = 0; idx < CRHBYTES; idx++) {
        *out++ = tr[idx];
    }

    /* Small-coefficient vectors s1 (L rows) and s2 (K rows). */
    for (idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(out, &s1->vec[idx]);
        out += POLETA_SIZE_PACKED;
    }
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(out, &s2->vec[idx]);
        out += POLETA_SIZE_PACKED;
    }

    /* Low part t0 (K rows). */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(out, &t0->vec[idx]);
        out += POLT0_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_sk
*
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
*              Fields are read in the order in which
*              PQCLEAN_DILITHIUM3_AVX2_pack_sk writes them.
*
* Arguments:   - uint8_t rho[]: output byte array for rho
*              - uint8_t key[]: output byte array for key
*              - uint8_t tr[]: output byte array for tr
*              - polyvecl *s1: pointer to output vector s1
*              - polyveck *s2: pointer to output vector s2
*              - polyveck *t0: pointer to output vector t0
*              - const uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(
    uint8_t *rho,
    uint8_t *key,
    uint8_t *tr,
    polyvecl *s1,
    polyveck *s2,
    polyveck *t0,
    const uint8_t *sk) {
    const uint8_t *in = sk;
    unsigned int idx;

    /* Raw byte fields: rho, key, tr. */
    for (idx = 0; idx < SEEDBYTES; idx++) {
        rho[idx] = *in++;
    }
    for (idx = 0; idx < SEEDBYTES; idx++) {
        key[idx] = *in++;
    }
    for (idx = 0; idx < CRHBYTES; idx++) {
        tr[idx] = *in++;
    }

    /* Small-coefficient vectors s1 (L rows) and s2 (K rows). */
    for (idx = 0; idx < L; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[idx], in);
        in += POLETA_SIZE_PACKED;
    }
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[idx], in);
        in += POLETA_SIZE_PACKED;
    }

    /* Low part t0 (K rows). */
    for (idx = 0; idx < K; idx++) {
        PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[idx], in);
        in += POLT0_SIZE_PACKED;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_pack_sig
*
* Description: Bit-pack signature sig = (z, h, c).
*              Layout: L packed z polynomials; OMEGA bytes of hint
*              positions followed by K running counters; N/8 bytes
*              marking the nonzero coefficients of c; 8 bytes of sign
*              bits (bit set when the coefficient equals Q - 1).
*
* Arguments:   - uint8_t sig[]: output byte array
*              - const polyvecl *z: pointer to vector z
*              - const polyveck *h: pointer to hint vector h
*              - const poly *c: pointer to challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(
    uint8_t *sig,
    const polyvecl *z,
    const polyveck *h,
    const poly *c) {
    uint8_t *out = sig;
    unsigned int row, pos, used;
    uint64_t signs, next_sign;

    for (row = 0; row < L; ++row) {
        PQCLEAN_DILITHIUM3_AVX2_polyz_pack(out, &z->vec[row]);
        out += POLZ_SIZE_PACKED;
    }

    /* Encode h: positions of the nonzero hints, row after row, and after
       each row the running total in out[OMEGA + row]. */
    used = 0;
    for (row = 0; row < K; ++row) {
        for (pos = 0; pos < N; ++pos) {
            if (h->vec[row].coeffs[pos] == 0) {
                continue;
            }
            out[used] = (uint8_t)pos;
            ++used;
        }
        out[OMEGA + row] = (uint8_t)used;
    }
    /* Unused position slots are zero-filled. */
    for (; used < OMEGA; ++used) {
        out[used] = 0;
    }
    out += OMEGA + K;

    /* Encode c: one presence bit per coefficient, plus one sign bit per
       nonzero coefficient collected in order of appearance. */
    signs = 0;
    next_sign = 1;
    for (row = 0; row < N / 8; ++row) {
        uint8_t present = 0;

        for (pos = 0; pos < 8; ++pos) {
            if (c->coeffs[8 * row + pos] == 0) {
                continue;
            }
            present |= (uint8_t)(1u << pos);
            if (c->coeffs[8 * row + pos] == (Q - 1)) {
                signs |= next_sign;
            }
            next_sign <<= 1;
        }
        out[row] = present;
    }
    out += N / 8;

    /* Sign word, least significant byte first. */
    for (row = 0; row < 8; ++row) {
        out[row] = (uint8_t)signs;
        signs >>= 8;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_unpack_sig
*
* Description: Unpack signature sig = (z, h, c) and reject encodings that
*              are not canonical. A signature is rejected when
*                - a hint counter is smaller than the previous one or
*                  larger than OMEGA,
*                - the hint positions of one row are not strictly
*                  increasing,
*                - an unused hint position byte is nonzero,
*                - any of the top four bits of the 64-bit sign word is set.
*              h and c may be partially written when 1 is returned.
*
* Arguments:   - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - poly *c: pointer to output challenge polynomial
*              - const uint8_t sig[]: byte array containing
*                                     bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(
    polyvecl *z,
    polyveck *h,
    poly *c,
    const uint8_t *sig) {
    unsigned int i, j, k;
    uint64_t signs;

    for (i = 0; i < L; ++i) {
        PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED);
    }
    sig += L * POLZ_SIZE_PACKED;

    /* Decode h */
    k = 0;
    for (i = 0; i < K; ++i) {
        for (j = 0; j < N; ++j) {
            h->vec[i].coeffs[j] = 0;
        }

        /* sig[OMEGA + i] is the running number of hints up to row i. */
        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
            return 1;
        }

        for (j = k; j < sig[OMEGA + i]; ++j) {
            /* Coefficients are ordered for strong unforgeability */
            if (j > k && sig[j] <= sig[j - 1]) {
                return 1;
            }
            h->vec[i].coeffs[sig[j]] = 1;
        }

        k = sig[OMEGA + i];
    }

    /* Extra indices are zero for strong unforgeability */
    for (j = k; j < OMEGA; ++j) {
        if (sig[j]) {
            return 1;
        }
    }

    sig += OMEGA + K;

    /* Decode c */
    for (i = 0; i < N; ++i) {
        c->coeffs[i] = 0;
    }

    /* The sign word follows the N/8 presence bytes, low byte first. */
    signs = 0;
    for (i = 0; i < 8; ++i) {
        signs |= (uint64_t)sig[N / 8 + i] << 8 * i;
    }

    /* Extra sign bits are zero for strong unforgeability */
    if (signs >> 60) {
        return 1;
    }

    for (i = 0; i < N / 8; ++i) {
        for (j = 0; j < 8; ++j) {
            if ((sig[i] >> j) & 0x01) {
                /* All-ones mask when the next sign bit is set, else zero.
                   Built in unsigned arithmetic so the 64-bit sign word is
                   never narrowed into a signed 32-bit type. */
                const uint32_t negate = 0u - (uint32_t)(signs & 1);

                /* 1 ^ (Q - 1) == Q, so the XOR turns 1 into Q - 1 (that
                   is, -1 mod Q) exactly when the mask is set. */
                c->coeffs[8 * i + j] = 1;
                c->coeffs[8 * i + j] ^= negate & (uint32_t)(1 ^ (Q - 1));
                signs >>= 1;
            }
        }
    }

    return 0;
}

+ 36
- 0
crypto_sign/dilithium3/avx2/packing.h View File

@@ -0,0 +1,36 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_PACKING_H
#define PQCLEAN_DILITHIUM3_AVX2_PACKING_H

#include "params.h"
#include "polyvec.h"

/* Serialize the public key (rho, t1) into pk. */
void PQCLEAN_DILITHIUM3_AVX2_pack_pk(
uint8_t *pk,
const uint8_t *rho, const polyveck *t1);
/* Serialize the secret key (rho, key, tr, s1, s2, t0) into sk. */
void PQCLEAN_DILITHIUM3_AVX2_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
/* Serialize the signature (z, h, c) into sig. */
void PQCLEAN_DILITHIUM3_AVX2_pack_sig(
uint8_t *sig,
const polyvecl *z, const polyveck *h, const poly *c);

/* Inverse of pack_pk: rho and t1 are outputs, pk is the packed input. */
void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(
uint8_t *rho, polyveck *t1,
const uint8_t *pk);
/* Inverse of pack_sk: every argument except sk is an output. */
void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
/* Inverse of pack_sig. Returns 1 when the encoding is malformed, 0 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(
polyvecl *z, polyveck *h, poly *c, const uint8_t *sig);

#endif

+ 33
- 0
crypto_sign/dilithium3/avx2/params.h View File

@@ -0,0 +1,33 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM3_AVX2_PARAMS_H


/* Byte lengths of the seeds (rho, key) and of the tr field, as copied by the
 * packing routines. */
#define SEEDBYTES 32
#define CRHBYTES 48
/* Number of coefficients per polynomial. */
#define N 256
/* Modulus; 8380417 = 2^23 - 2^13 + 1, which fits in QBITS = 23 bits. */
#define Q 8380417
#define QBITS 23
/* Number of low bits split off by power2round (c = c1*2^D + c0). */
#define D 14
/* (Q - 1)/16 = 523776. */
#define GAMMA1 ((Q - 1)/16)
/* GAMMA1/2 = 261888. */
#define GAMMA2 (GAMMA1/2)
/* 2*GAMMA2 = 523776; the divisor used by decompose (c = c1*ALPHA + c0). */
#define ALPHA (2*GAMMA2)

/* Vector dimensions: polyveck holds K polynomials, polyvecl holds L. */
#define K 5
#define L 4
/* ETA and the number of bits used to pack one eta-range coefficient. */
#define ETA 5
#define SETABITS 4
#define BETA 275
/* Number of hint-position bytes reserved in a packed signature. */
#define OMEGA 96


/* Packed sizes in bytes of one polynomial, by coefficient width:
 *   t1: QBITS - D = 9 bits  -> 288
 *   t0: D = 14 bits         -> 448
 *   eta: SETABITS = 4 bits  -> 128
 *   z:  QBITS - 3 = 20 bits -> 640
 *   w1: 4 bits              -> 128 */
#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8)
#define POLT0_SIZE_PACKED ((N*D)/8)
#define POLETA_SIZE_PACKED ((N*SETABITS)/8)
#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8)
#define POLW1_SIZE_PACKED ((N*4)/8)

/* Total encoded sizes; with the values above these evaluate to
 * 1472 (public key), 3504 (secret key) and 2701 (signature) bytes. */
#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLT1_SIZE_PACKED)
#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + (L + K)*POLETA_SIZE_PACKED + CRHBYTES + K*POLT0_SIZE_PACKED)
#define CRYPTO_BYTES (L*POLZ_SIZE_PACKED + (OMEGA + K) + (N/8 + 8))

#endif

+ 191
- 0
crypto_sign/dilithium3/avx2/pointwise.S View File

@@ -0,0 +1,191 @@
#include "params.h"

/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_avx
 *
 * Coefficient-wise product of two arrays of 256 32-bit words. Every 64-bit
 * product p is reduced as (p + ((p * qinv) mod 2^32) * q) >> 32.
 *
 * Registers as used below: rdi = output, rsi = first input, rdx = second
 * input (poly.c calls this with c->coeffs, a->coeffs, b->coeffs).
 * All accesses use vmovdqa, so the three arrays must be 32-byte aligned.
 * Overwrites eax, ymm0-ymm7 and ymm10-ymm15, and advances rdi, rsi, rdx.
 *
 * Work split: 10 loop iterations of 3 vectors (240 coefficients) plus an
 * unrolled tail of 2 vectors (16 coefficients).
 */
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx
PQCLEAN_DILITHIUM3_AVX2_pointwise_avx:
#consts
/* ymm0 = 8 x qinv, ymm1 = 8 x q (tables defined in nttconsts.c) */
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1

/* eax = loop counter */
xor %eax,%eax
_looptop1:
#load
/* three vectors from each input */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
/* vpmuludq only reads the low dword of each qword, so the odd-indexed
   coefficients are shifted down into separate registers */
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vpsrlq $32,%ymm14,%ymm15

#mul
/* 64-bit products: even coefficients in ymm2/4/6, odd in ymm3/5/7 */
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5
vpmuludq %ymm6,%ymm14,%ymm6
vpmuludq %ymm7,%ymm15,%ymm7

#reduce
/* t = low32(product) * qinv */
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm0,%ymm6,%ymm14
vpmuludq %ymm0,%ymm7,%ymm15
/* t = low32(t) * q */
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
/* product + t: the result is left in the upper dword of each qword */
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpaddq %ymm6,%ymm14,%ymm6
vpaddq %ymm7,%ymm15,%ymm7
/* only the even results are shifted down; the odd results already sit in
   the odd dword positions where they belong */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vpsrlq $32,%ymm6,%ymm6

#store
/* 0xAA selects the odd dwords from the first source operand (odd results)
   and the even dwords from the second (even results) */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

/* advance all three pointers by 96 bytes (24 coefficients) */
add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* tail: the remaining two vectors, same steps as the loop body */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13

#mul
vpmuludq %ymm2,%ymm10,%ymm2
vpmuludq %ymm3,%ymm11,%ymm3
vpmuludq %ymm4,%ymm12,%ymm4
vpmuludq %ymm5,%ymm13,%ymm5

#reduce
vpmuludq %ymm0,%ymm2,%ymm10
vpmuludq %ymm0,%ymm3,%ymm11
vpmuludq %ymm0,%ymm4,%ymm12
vpmuludq %ymm0,%ymm5,%ymm13
vpmuludq %ymm1,%ymm10,%ymm10
vpmuludq %ymm1,%ymm11,%ymm11
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpaddq %ymm2,%ymm10,%ymm2
vpaddq %ymm3,%ymm11,%ymm3
vpaddq %ymm4,%ymm12,%ymm4
vpaddq %ymm5,%ymm13,%ymm5
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4

#store
/* same selection as the 0xAA blends in the loop, written with the operands
   swapped and the complementary mask 0x55 */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret

/*
 * pointwise off
 *
 * Loads two vectors (16 coefficients) at byte offset \off from rsi and from
 * rdx and multiplies them lane by lane without reducing. The 64-bit products
 * are left in ymm6/ymm8 (even coefficients) and ymm7/ymm9 (odd coefficients).
 * Overwrites ymm6-ymm13.
 */
.macro pointwise off
#load
vmovdqa \off(%rsi),%ymm6
vmovdqa \off+32(%rsi),%ymm8
vmovdqa \off(%rdx),%ymm10
vmovdqa \off+32(%rdx),%ymm12
/* bring the odd-indexed dwords down so vpmuludq can read them */
vpsrlq $32,%ymm6,%ymm7
vpsrlq $32,%ymm8,%ymm9
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13

#mul
vpmuludq %ymm6,%ymm10,%ymm6
vpmuludq %ymm7,%ymm11,%ymm7
vpmuludq %ymm8,%ymm12,%ymm8
vpmuludq %ymm9,%ymm13,%ymm9
.endm

/*
 * acc
 *
 * Adds the four 64-bit product vectors left in ymm6-ymm9 by the pointwise
 * macro onto the running sums kept in ymm2-ymm5.
 */
.macro acc
vpaddq %ymm6,%ymm2,%ymm2
vpaddq %ymm7,%ymm3,%ymm3
vpaddq %ymm8,%ymm4,%ymm4
vpaddq %ymm9,%ymm5,%ymm5
.endm

/*
 * PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx
 *
 * For each of 256 coefficient positions: multiplies the words found at byte
 * offsets 0, 1024, 2048 and 3072 from rsi with the words at the same offsets
 * from rdx, adds the four 64-bit products, and applies a single reduction
 * (sum + ((sum * qinv) mod 2^32) * q) >> 32 to the total. The result is
 * written to rdi.
 *
 * 1024 bytes is one polynomial of 256 32-bit words, so each input is read as
 * four consecutive polynomials.
 * NOTE(review): presumably called with two polyvecl (L = 4) arguments --
 * confirm against the caller in polyvec.c.
 *
 * All accesses use vmovdqa (32-byte alignment required). Overwrites eax and
 * ymm0-ymm13, and advances rdi, rsi, rdx. 16 iterations of 16 coefficients.
 */
.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx
PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx:
#consts
/* ymm0 = 8 x qinv, ymm1 = 8 x q */
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1

xor %eax,%eax
_looptop2:
/* products of the first polynomial pair seed the accumulators */
pointwise 0

#mov
vmovdqa %ymm6,%ymm2
vmovdqa %ymm7,%ymm3
vmovdqa %ymm8,%ymm4
vmovdqa %ymm9,%ymm5

/* remaining three polynomial pairs are added on top */
pointwise 1024
acc

pointwise 2048
acc

pointwise 3072
acc


#reduce
/* t = low32(sum) * qinv, then t = low32(t) * q, then sum + t */
vpmuludq %ymm0,%ymm2,%ymm6
vpmuludq %ymm0,%ymm3,%ymm7
vpmuludq %ymm0,%ymm4,%ymm8
vpmuludq %ymm0,%ymm5,%ymm9
vpmuludq %ymm1,%ymm6,%ymm6
vpmuludq %ymm1,%ymm7,%ymm7
vpmuludq %ymm1,%ymm8,%ymm8
vpmuludq %ymm1,%ymm9,%ymm9
vpaddq %ymm2,%ymm6,%ymm2
vpaddq %ymm3,%ymm7,%ymm3
vpaddq %ymm4,%ymm8,%ymm4
vpaddq %ymm5,%ymm9,%ymm5
/* even results move down to the low dword; odd results stay in place */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4

#store
/* merge: odd dwords from ymm3/ymm5, even dwords from ymm2/ymm4 */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4

vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

/* next 16 coefficients */
add $64,%rsi
add $64,%rdx
add $64,%rdi
add $1,%eax
cmp $16,%eax
jb _looptop2

ret

+ 914
- 0
crypto_sign/dilithium3/avx2/poly.c View File

@@ -0,0 +1,914 @@
#include <immintrin.h>
#include <stdint.h>

#include "fips202x4.h"
#include "ntt.h"
#include "nttconsts.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rejsample.h"
#include "rounding.h"
#include "symmetric.h"

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_poly_reduce
*
* Description: Reduce all coefficients of input polynomial to representative
* in [0,2*Q[. Thin wrapper that hands the coefficient array to the
* assembly routine PQCLEAN_DILITHIUM3_AVX2_reduce_avx.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a) {
PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_poly_csubq
*
* Description: For all coefficients of input polynomial subtract Q if
* coefficient is bigger than Q. Thin wrapper that hands the
* coefficient array to PQCLEAN_DILITHIUM3_AVX2_csubq_avx.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a) {
PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs);
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_poly_freeze
*
* Description: Reduce all coefficients of the polynomial to standard
* representatives. Done in two passes over the coefficient array:
* reduce_avx followed by csubq_avx, i.e. the same work as
* poly_reduce followed by poly_csubq.
*
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a) {
PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs);
PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs);

}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_add
*
* Description: Coefficient-wise sum c = a + b, eight 32-bit coefficients
*              per step. No modular reduction is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first summand
*              - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) {
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i lhs = _mm256_load_si256(&a->coeffs_x8[lane]);
        const __m256i rhs = _mm256_load_si256(&b->coeffs_x8[lane]);

        _mm256_store_si256(&c->coeffs_x8[lane], _mm256_add_epi32(lhs, rhs));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_sub
*
* Description: Coefficient-wise difference c = a + 2*Q - b, eight 32-bit
*              coefficients per step. Assumes coefficients of the second
*              input polynomial to be less than 2*Q. No modular reduction
*              is performed.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const poly *a: pointer to first input polynomial
*              - const poly *b: pointer to second input polynomial to be
*                               subtracted from first input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) {
    const __m256i two_q = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec);
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i minuend = _mm256_load_si256(&a->coeffs_x8[lane]);
        const __m256i subtrahend = _mm256_load_si256(&b->coeffs_x8[lane]);
        /* Adding 2*Q first keeps the difference non-negative. */
        const __m256i lifted = _mm256_add_epi32(minuend, two_q);

        _mm256_store_si256(&c->coeffs_x8[lane], _mm256_sub_epi32(lifted, subtrahend));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_shiftl
*
* Description: Multiply polynomial by 2^D without modular reduction, by
*              shifting every 32-bit coefficient left by D bits. Assumes
*              input coefficients to be less than 2^{32-D}.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a) {
    unsigned int lane;

    for (lane = 0; lane < N / 8; ++lane) {
        const __m256i before = _mm256_load_si256(&a->coeffs_x8[lane]);

        _mm256_store_si256(&a->coeffs_x8[lane], _mm256_slli_epi32(before, D));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_ntt
*
* Description: Forward NTT, in place. Runs two assembly passes through a
*              64-bit scratch buffer: ntt_levels0t2_avx (a -> tmp) and
*              ntt_levels3t8_avx (tmp -> a). Output coefficients can be
*              up to 16*Q larger than input coefficients.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a) {
    ALIGNED_UINT64(N) tmp;

    /* First pass: N/32 calls, each starting 4 coefficients further in and
       all sharing the twiddles at zetas[1..]. */
    for (unsigned int blk = 0; blk < N / 32; blk++) {
        const unsigned int off = 4 * blk;

        PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(tmp.as_arr + off, a->coeffs + off, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 1);
    }

    /* Second pass: N/32 calls on consecutive 32-coefficient chunks, each
       with its own run of 31 twiddles starting at zetas[8 + 31 * blk]. */
    for (unsigned int blk = 0; blk < N / 32; blk++) {
        const unsigned int off = 32 * blk;

        PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(a->coeffs + off, tmp.as_arr + off, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 8 + 31 * blk);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery
*
* Description: Inverse NTT and multiplication with 2^{32}, in place. Runs
*              two assembly passes through a 64-bit scratch buffer:
*              invntt_levels0t4_avx (a -> tmp) and invntt_levels5t7_avx
*              (tmp -> a). Input coefficients need to be less than 2*Q.
*              Output coefficients are less than 2*Q.
*
* Arguments:   - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a) {
    ALIGNED_UINT64(N) tmp;

    /* First pass: consecutive 32-coefficient chunks, each with its own run
       of 31 twiddles starting at zetas_inv[31 * blk]. */
    for (unsigned int blk = 0; blk < N / 32; blk++) {
        const unsigned int off = 32 * blk;

        PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(tmp.as_arr + off, a->coeffs + off, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 31 * blk);
    }

    /* Second pass: N/32 calls, each starting 4 coefficients further in and
       all sharing the twiddles at zetas_inv[248..]. */
    for (unsigned int blk = 0; blk < N / 32; blk++) {
        const unsigned int off = 4 * blk;

        PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(a->coeffs + off, tmp.as_arr + off, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 248);
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery
*
* Description: Pointwise multiplication of polynomials in NTT domain
* representation and multiplication of resulting polynomial
* with 2^{-32}. Output coefficients are less than 2*Q if input
* coefficient are less than 22*Q.
* Thin wrapper around the assembly routine
* PQCLEAN_DILITHIUM3_AVX2_pointwise_avx (pointwise.S), which uses
* aligned vector loads and stores on all three coefficient arrays.
*
* Arguments: - poly *c: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_power2round
*
* Description: For all coefficients c of the input polynomial,
*              compute c0, c1 such that c mod Q = c1*2^D + c0
*              with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives. Applies the scalar
*              PQCLEAN_DILITHIUM3_AVX2_power2round to each coefficient.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients Q + c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *restrict a1,
        poly *restrict a0,
        const poly *restrict a) {
    /* unsigned int matches the other coefficient loops in this file and
       avoids using size_t, which none of the headers included here is
       guaranteed to declare. */
    unsigned int i;

    for (i = 0; i < N; ++i) {
        a1->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_decompose
*
* Description: For all coefficients c of the input polynomial,
*              compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0
*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*              Applies the scalar PQCLEAN_DILITHIUM3_AVX2_decompose to
*              each coefficient.
*
* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
*              - poly *a0: pointer to output polynomial with coefficients Q + c0
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(
    poly *restrict a1,
    poly *restrict a0,
    const poly *restrict a) {
    unsigned int idx = 0;

    while (idx < N) {
        /* The helper returns the high part and writes the low part
           through its second argument. */
        a1->coeffs[idx] = PQCLEAN_DILITHIUM3_AVX2_decompose(a->coeffs[idx], &a0->coeffs[idx]);
        ++idx;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_make_hint
*
* Description: Compute hint polynomial. The coefficients of which indicate
*              whether the low bits of the corresponding coefficient of
*              the input polynomial overflow into the high bits. Each hint
*              comes from the scalar PQCLEAN_DILITHIUM3_AVX2_make_hint.
*
* Arguments:   - poly *h: pointer to output hint polynomial
*              - const poly *a0: pointer to low part of input polynomial
*              - const poly *a1: pointer to high part of input polynomial
*
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(
    poly *restrict h,
    const poly *restrict a0,
    const poly *restrict a1) {
    unsigned int ones = 0;

    for (unsigned int idx = 0; idx < N; idx++) {
        const uint32_t bit = PQCLEAN_DILITHIUM3_AVX2_make_hint(a0->coeffs[idx], a1->coeffs[idx]);

        h->coeffs[idx] = bit;
        ones += bit;
    }

    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_use_hint
*
* Description: Use hint polynomial to correct the high bits of a polynomial,
*              by applying the scalar PQCLEAN_DILITHIUM3_AVX2_use_hint to
*              each coefficient and its hint.
*
* Arguments:   - poly *a: pointer to output polynomial with corrected high bits
*              - const poly *b: pointer to input polynomial
*              - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(
    poly *restrict a,
    const poly *restrict b,
    const poly *restrict h) {
    unsigned int idx = 0;

    while (idx < N) {
        a->coeffs[idx] = PQCLEAN_DILITHIUM3_AVX2_use_hint(b->coeffs[idx], h->coeffs[idx]);
        ++idx;
    }
}

/*************************************************
* Name: PQCLEAN_DILITHIUM3_AVX2_poly_chknorm
*
* Description: Check infinity norm of polynomial against given bound.
* Assumes input coefficients to be standard representatives.
* The absolute value of each centralized coefficient is computed
* without a data-dependent branch; only the comparison against B
* exits early.
*
* Arguments: - const poly *a: pointer to polynomial
* - uint32_t B: norm bound
*
* Returns 0 if norm is strictly smaller than B and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B) {
unsigned int i;
int32_t t;

/* It is ok to leak which coefficient violates the bound since
the probability for each coefficient is independent of secret
data but we must not leak the sign of the centralized representative. */
for (i = 0; i < N; ++i) {
/* Absolute value of centralized representative */
/* With m = (Q - 1) / 2 and coefficient x: t = m - x is negative exactly
when x > m. */
t = (Q - 1) / 2 - a->coeffs[i];
/* t >> 31 is 0 for non-negative t and all ones for negative t (this
relies on an arithmetic right shift), so negative t becomes
~t = x - m - 1 and non-negative t is left unchanged. */
t ^= (t >> 31);
/* m - t is now x when x <= m and 2*m + 1 - x = Q - x otherwise. */
t = (Q - 1) / 2 - t;

if ((uint32_t)t >= B) {
return 1;
}
}

return 0;
}

/*************************************************
* Name:        rej_uniform_ref
*
* Description: Sample uniformly random coefficients in [0, Q-1] by
*              performing rejection sampling using array of random bytes.
*              Each candidate is built from three consecutive bytes
*              (little-endian), masked to 23 bits, and kept only when it
*              is smaller than Q.
*
* Arguments:   - uint32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const unsigned char *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_uniform_ref(uint32_t *a,
                                    unsigned int len,
                                    const unsigned char *buf,
                                    unsigned int buflen) {
    unsigned int accepted = 0;
    unsigned int pos;

    /* A trailing group of fewer than three bytes is never consumed. */
    for (pos = 0; accepted < len && pos + 3 <= buflen; pos += 3) {
        uint32_t cand = (uint32_t)buf[pos]
                        | ((uint32_t)buf[pos + 1] << 8)
                        | ((uint32_t)buf[pos + 2] << 16);

        cand &= 0x7FFFFF;
        if (cand < Q) {
            a[accepted] = cand;
            ++accepted;
        }
    }

    return accepted;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform
*
* Description: Sample polynomial with uniformly random coefficients
*              in [0,Q-1] by performing rejection sampling on the output
*              of the stream128 generator initialized with (seed, nonce).
*              NOTE(review): the previous header named SHAKE256; the code
*              drives stream128_* -- confirm the primitive in symmetric.h.
*              The first batch of blocks goes through the vectorized
*              PQCLEAN_DILITHIUM3_AVX2_rej_uniform; if that yields fewer
*              than N coefficients, further blocks are squeezed one at a
*              time and sampled with rej_uniform_ref.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce) {
    /* Two spare bytes hold a carried-over partial 3-byte group. */
    unsigned char buf[POLY_UNIFORM_BUFLEN + 2];
    unsigned int nblocks = POLY_UNIFORM_NBLOCKS;
    unsigned int filled = POLY_UNIFORM_BUFLEN;
    unsigned int accepted;
    stream128_state state;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, nblocks, &state);

    accepted = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a->coeffs, N, buf, filled);

    while (accepted < N) {
        /* Bytes left over after the last whole 3-byte group move to the
           front so they can be completed by the next block. */
        const unsigned int leftover = filled % 3;
        unsigned int j;

        for (j = 0; j < leftover; ++j) {
            buf[j] = buf[filled - leftover + j];
        }

        filled = STREAM128_BLOCKBYTES + leftover;
        stream128_squeezeblocks(buf + leftover, 1, &state);
        accepted += rej_uniform_ref(a->coeffs + accepted, N - accepted, buf, filled);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x
*
* Description: Sample four polynomials with coefficients in [0,Q-1] from
*              four parallel SHAKE128 streams. Stream n absorbs the seed
*              followed by the two bytes of nonce n (low byte first).
*              Five blocks per stream are squeezed up front and sampled
*              with PQCLEAN_DILITHIUM3_AVX2_rej_uniform; while any
*              polynomial is still incomplete, one more block per stream
*              is squeezed and sampled with rej_uniform_ref.
*
* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to output polynomials
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce0..nonce3: 2-byte nonce of each stream
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    poly *const out[4] = {a0, a1, a2, a3};
    const uint16_t nonces[4] = {nonce0, nonce1, nonce2, nonce3};
    unsigned int ctr[4];
    unsigned int lane, i;
    unsigned char inbuf[4][SEEDBYTES + 2];
    unsigned char outbuf[4][5 * SHAKE128_RATE];
    __m256i state[25];

    /* Per-stream input: seed || nonce (little-endian). */
    for (lane = 0; lane < 4; ++lane) {
        for (i = 0; i < SEEDBYTES; ++i) {
            inbuf[lane][i] = seed[i];
        }
        inbuf[lane][SEEDBYTES + 0] = nonces[lane];
        inbuf[lane][SEEDBYTES + 1] = nonces[lane] >> 8;
    }

    PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            SEEDBYTES + 2);
    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
            state);

    for (lane = 0; lane < 4; ++lane) {
        ctr[lane] = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(out[lane]->coeffs, N, outbuf[lane], 5 * SHAKE128_RATE);
    }

    /* The four streams advance together, so every lane is offered the new
       block; a lane that is already full asks for zero coefficients. */
    while (ctr[0] < N || ctr[1] < N || ctr[2] < N || ctr[3] < N) {
        PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
                state);

        for (lane = 0; lane < 4; ++lane) {
            ctr[lane] += rej_uniform_ref(out[lane]->coeffs + ctr[lane], N - ctr[lane], outbuf[lane],
                                         SHAKE128_RATE);
        }
    }
}

/*************************************************
* Name:        rej_eta_ref
*
* Description: Sample uniformly random coefficients in [-ETA, ETA] by
*              performing rejection sampling on an array of random bytes.
*              Each byte yields two 4-bit candidates; a candidate t is
*              kept when t <= 2*ETA and stored as Q + ETA - t.
*
* Arguments:   - uint32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const unsigned char *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_eta_ref(uint32_t *a,
                                unsigned int len,
                                const unsigned char *buf,
                                unsigned int buflen) {
    unsigned int stored = 0;
    unsigned int idx;

    for (idx = 0; stored < len && idx < buflen; ++idx) {
        const uint32_t lo = buf[idx] & 0x0F;
        const uint32_t hi = buf[idx] >> 4;

        if (lo <= 2 * ETA) {
            a[stored++] = Q + ETA - lo;
        }
        /* The low nibble may just have filled the last slot. */
        if (hi <= 2 * ETA && stored < len) {
            a[stored++] = Q + ETA - hi;
        }
    }

    return stored;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta
*
* Description: Sample polynomial with uniformly random coefficients
*              in [-ETA,ETA] by performing rejection sampling on the
*              output of the stream128 XOF initialised with seed and
*              nonce.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const unsigned char seed[]: byte array with seed of length
*                                            SEEDBYTES
*              - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce) {
    unsigned char buf[POLY_UNIFORM_ETA_BUFLEN];
    stream128_state state;
    unsigned int filled;

    stream128_init(&state, seed, nonce);
    stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);

    /* First pass over the initial blocks. */
    filled = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN);

    /* Candidates are byte-aligned, so refills need no carry-over. */
    for (; filled < N;) {
        stream128_squeezeblocks(buf, 1, &state);
        filled += rej_eta_ref(a->coeffs + filled, N - filled, buf, STREAM128_BLOCKBYTES);
    }
}

/* Four-lane variant of poly_uniform_eta: fills a0..a3 from four SHAKE128
 * streams that share the seed and differ only in their nonce. */
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[SEEDBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    poly *const dst[4] = {a0, a1, a2, a3};
    const uint16_t nonces[4] = {nonce0, nonce1, nonce2, nonce3};
    unsigned int filled[4];
    unsigned int lane, j;
    unsigned char inbuf[4][SEEDBYTES + 2];
    unsigned char outbuf[4][2 * SHAKE128_RATE];
    __m256i state[25];

    /* Per-lane XOF input: seed followed by the nonce in little-endian order. */
    for (lane = 0; lane < 4; ++lane) {
        for (j = 0; j < SEEDBYTES; ++j) {
            inbuf[lane][j] = seed[j];
        }
        inbuf[lane][SEEDBYTES + 0] = (unsigned char)nonces[lane];
        inbuf[lane][SEEDBYTES + 1] = (unsigned char)(nonces[lane] >> 8);
    }

    PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            SEEDBYTES + 2);
    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2,
            state);

    /* First pass over the initial two blocks of every lane. */
    for (lane = 0; lane < 4; ++lane) {
        filled[lane] = PQCLEAN_DILITHIUM3_AVX2_rej_eta(dst[lane]->coeffs, N, outbuf[lane],
                       2 * SHAKE128_RATE);
    }

    /* Keep squeezing single blocks while any lane is still short. */
    while (filled[0] < N || filled[1] < N || filled[2] < N || filled[3] < N) {
        PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
                state);

        for (lane = 0; lane < 4; ++lane) {
            filled[lane] += rej_eta_ref(dst[lane]->coeffs + filled[lane], N - filled[lane],
                                        outbuf[lane], SHAKE128_RATE);
        }
    }
}

/*************************************************
* Name:        rej_gamma1m1_ref
*
* Description: Sample uniformly random coefficients
*              in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection
*              sampling on an array of random bytes. Every 5-byte group
*              yields two 20-bit candidates; a candidate t is kept when
*              t <= 2*GAMMA1 - 2 and stored as Q + GAMMA1 - 1 - t.
*
* Arguments:   - uint32_t *a: pointer to output array (allocated)
*              - unsigned int len: number of coefficients to be sampled
*              - const unsigned char *buf: array of random bytes
*              - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
* random bytes were given.
**************************************************/
static unsigned int rej_gamma1m1_ref(uint32_t *a,
                                     unsigned int len,
                                     const unsigned char *buf,
                                     unsigned int buflen) {
    unsigned int stored = 0;
    unsigned int idx;

    /* Only complete 5-byte groups are consumed. */
    for (idx = 0; stored < len && idx + 5 <= buflen; idx += 5) {
        const unsigned char *grp = buf + idx;
        uint32_t lo, hi;

        /* Low 20 bits of the group. */
        lo = (uint32_t)grp[0]
             | ((uint32_t)grp[1] << 8)
             | ((uint32_t)grp[2] << 16);
        lo &= 0xFFFFF;

        /* High 20 bits of the group. */
        hi = (uint32_t)(grp[2] >> 4)
             | ((uint32_t)grp[3] << 4)
             | ((uint32_t)grp[4] << 12);

        if (lo <= 2 * GAMMA1 - 2) {
            a[stored++] = Q + GAMMA1 - 1 - lo;
        }
        /* The first candidate may just have filled the last slot. */
        if (hi <= 2 * GAMMA1 - 2 && stored < len) {
            a[stored++] = Q + GAMMA1 - 1 - hi;
        }
    }
    return stored;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1
*
* Description: Sample polynomial with uniformly random coefficients
*              in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection
*              sampling on the output of the stream256 XOF initialised
*              with seed and nonce.
*
* Arguments:   - poly *a: pointer to output polynomial
*              - const unsigned char seed[]: byte array with seed of length
*                                            CRHBYTES
*              - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES)
#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1(poly *a,
        const unsigned char seed[CRHBYTES],
        uint16_t nonce) {
    /* Four spare bytes: a refill may be preceded by up to 4 carried-over bytes. */
    unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
    stream256_state state;
    unsigned int avail = POLY_UNIFORM_GAMMA1M1_BUFLEN;
    unsigned int filled;

    stream256_init(&state, seed, nonce);
    stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state);

    /* First pass over the initial blocks. */
    filled = PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN);

    /* Refill one block at a time until all N coefficients are set. */
    while (filled < N) {
        const unsigned int carry = avail % 5;
        unsigned int k;

        /* Move the bytes of the trailing incomplete 5-byte group to the front. */
        for (k = 0; k < carry; ++k) {
            buf[k] = buf[avail - carry + k];
        }

        stream256_squeezeblocks(buf + carry, 1, &state);
        avail = STREAM256_BLOCKBYTES + carry;
        filled += rej_gamma1m1_ref(a->coeffs + filled, N - filled, buf, avail);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x
*
* Description: Four-lane variant of poly_uniform_gamma1m1. Fills a0..a3 with
*              coefficients in [-(GAMMA1 - 1), GAMMA1 - 1] by rejection
*              sampling on four SHAKE256 streams that share the seed and
*              differ only in their nonce.
*
*              The sampler consumes 5-byte groups. Bytes of an incomplete
*              trailing group are carried over in front of the next
*              squeezed block, as the single-lane function does, so every
*              lane reads its stream without gaps.
*
* Arguments:   - poly *a0..a3: pointers to the four output polynomials
*              - const unsigned char seed[]: byte array with seed of length
*                                            CRHBYTES
*              - uint16_t nonce0..nonce3: 16-bit nonce of each lane
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const unsigned char seed[CRHBYTES],
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3) {
    unsigned int i, lane, off, buflen;
    unsigned int ctr0, ctr1, ctr2, ctr3;
    unsigned char inbuf[4][CRHBYTES + 2];
    unsigned char outbuf[4][5 * SHAKE256_RATE];
    /* Staging area for refill blocks, so the squeeze routine is always handed
     * row starts, as in the initial squeeze. */
    unsigned char blk[4][SHAKE256_RATE];
    __m256i state[25];

    for (i = 0; i < CRHBYTES; ++i) {
        inbuf[0][i] = seed[i];
        inbuf[1][i] = seed[i];
        inbuf[2][i] = seed[i];
        inbuf[3][i] = seed[i];
    }
    /* Nonce is appended in little-endian byte order. */
    inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF;
    inbuf[0][CRHBYTES + 1] = nonce0 >> 8;
    inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF;
    inbuf[1][CRHBYTES + 1] = nonce1 >> 8;
    inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF;
    inbuf[2][CRHBYTES + 1] = nonce2 >> 8;
    inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF;
    inbuf[3][CRHBYTES + 1] = nonce3 >> 8;

    PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
            CRHBYTES + 2);
    PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
            state);

    buflen = 5 * SHAKE256_RATE;
    ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], buflen);
    ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], buflen);
    ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], buflen);
    ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], buflen);

    while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) {
        /* A lane that is still short has consumed every complete 5-byte
         * group of its buffer, so exactly buflen % 5 bytes are pending.
         * Lanes that are already full ignore their buffer (len == 0). */
        off = buflen % 5;
        for (lane = 0; lane < 4; ++lane) {
            for (i = 0; i < off; ++i) {
                outbuf[lane][i] = outbuf[lane][buflen - off + i];
            }
        }

        PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(blk[0], blk[1], blk[2], blk[3], 1,
                state);

        /* Append the fresh block behind the carried-over bytes. */
        for (lane = 0; lane < 4; ++lane) {
            for (i = 0; i < SHAKE256_RATE; ++i) {
                outbuf[lane][off + i] = blk[lane][i];
            }
        }
        buflen = SHAKE256_RATE + off;

        ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], buflen);
        ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], buflen);
        ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], buflen);
        ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], buflen);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack
*
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
*              Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
*              Two coefficients are stored per byte, 4 bits each, the
*              even-indexed one in the low nibble.
*
* Arguments:   - unsigned char *r: pointer to output byte array with at least
*                                  POLETA_SIZE_PACKED bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int k;

    for (k = 0; k < N / 2; ++k) {
        /* Map each coefficient c to Q + ETA - c. */
        const unsigned char lo = (unsigned char)(Q + ETA - a->coeffs[2 * k + 0]);
        const unsigned char hi = (unsigned char)(Q + ETA - a->coeffs[2 * k + 1]);

        r[k] = (unsigned char)(lo | (hi << 4));
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack
*
* Description: Unpack polynomial with coefficients in [-ETA,ETA].
*              Output coefficients lie in [Q-ETA,Q+ETA]. Inverse of
*              polyeta_pack: every input byte holds two 4-bit values.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int k;

    for (k = 0; k < N / 2; ++k) {
        const uint32_t lo = a[k] & 0x0F;
        const uint32_t hi = a[k] >> 4;

        r->coeffs[2 * k + 0] = Q + ETA - lo;
        r->coeffs[2 * k + 1] = Q + ETA - hi;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
*              Input coefficients are assumed to be standard representatives.
*              Eight coefficients are written to nine output bytes.
*
* Arguments:   - unsigned char *r: pointer to output byte array with at least
*                                  POLT1_SIZE_PACKED bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int grp, k;

    for (grp = 0; grp < N / 8; ++grp) {
        const uint32_t *c = &a->coeffs[8 * grp];
        unsigned char *out = &r[9 * grp];

        out[0] = (uint8_t)(c[0] >> 0);
        /* Byte k joins the top k bits of c[k-1] with the low 8-k bits of c[k]. */
        for (k = 1; k < 8; ++k) {
            out[k] = (uint8_t)((c[k - 1] >> (9 - k)) | (c[k] << k));
        }
        out[8] = (uint8_t)(c[7] >> 1);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack
*
* Description: Unpack polynomial t1 with 9-bit coefficients.
*              Output coefficients are standard representatives.
*              Nine input bytes are expanded to eight coefficients.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int grp, k;

    for (grp = 0; grp < N / 8; ++grp) {
        const unsigned char *in = &a[9 * grp];
        uint32_t *c = &r->coeffs[8 * grp];

        /* Coefficient k starts at bit k of byte k and continues into byte k+1. */
        for (k = 0; k < 8; ++k) {
            c[k] = ((uint32_t)(in[k] >> k) | ((uint32_t)in[k + 1] << (8 - k))) & 0x1FF;
        }
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
*              Four coefficients are written to seven output bytes
*              (14 bits per coefficient).
*
* Arguments:   - unsigned char *r: pointer to output byte array with at least
*                                  POLT0_SIZE_PACKED bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(unsigned char *restrict r, const poly *restrict a) {
    const uint32_t bias = Q + (1U << (D - 1));
    unsigned int grp;

    for (grp = 0; grp < N / 4; ++grp) {
        const uint32_t *c = &a->coeffs[4 * grp];
        unsigned char *out = &r[7 * grp];

        /* Map each coefficient x to Q + 2^{D-1} - x. */
        const uint32_t t0 = bias - c[0];
        const uint32_t t1 = bias - c[1];
        const uint32_t t2 = bias - c[2];
        const uint32_t t3 = bias - c[3];

        out[0] = (unsigned char)t0;
        out[1] = (unsigned char)((t0 >> 8) | (t1 << 6));
        out[2] = (unsigned char)(t1 >> 2);
        out[3] = (unsigned char)((t1 >> 10) | (t2 << 4));
        out[4] = (unsigned char)(t2 >> 4);
        out[5] = (unsigned char)((t2 >> 12) | (t3 << 2));
        out[6] = (unsigned char)(t3 >> 6);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack
*
* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
*              Seven input bytes are expanded to four coefficients.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *restrict r, const unsigned char *restrict a) {
    const uint32_t bias = Q + (1U << (D - 1));
    unsigned int grp;

    for (grp = 0; grp < N / 4; ++grp) {
        const unsigned char *in = &a[7 * grp];
        uint32_t *c = &r->coeffs[4 * grp];

        /* Reassemble the four 14-bit fields. */
        const uint32_t t0 = (uint32_t)in[0]
                            | ((uint32_t)(in[1] & 0x3F) << 8);
        const uint32_t t1 = (uint32_t)(in[1] >> 6)
                            | ((uint32_t)in[2] << 2)
                            | ((uint32_t)(in[3] & 0x0F) << 10);
        const uint32_t t2 = (uint32_t)(in[3] >> 4)
                            | ((uint32_t)in[4] << 4)
                            | ((uint32_t)(in[5] & 0x03) << 12);
        const uint32_t t3 = (uint32_t)(in[5] >> 2)
                            | ((uint32_t)in[6] << 6);

        /* Undo the mapping applied when packing. */
        c[0] = bias - t0;
        c[1] = bias - t1;
        c[2] = bias - t2;
        c[3] = bias - t3;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyz_pack
*
* Description: Bit-pack polynomial z with coefficients
*              in [-(GAMMA1 - 1), GAMMA1 - 1].
*              Input coefficients are assumed to be standard representatives.
*              Two coefficients are written to five output bytes
*              (20 bits per coefficient).
*
* Arguments:   - unsigned char *r: pointer to output byte array with at least
*                                  POLZ_SIZE_PACKED bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int pair;

    for (pair = 0; pair < N / 2; ++pair) {
        unsigned char *out = &r[5 * pair];
        uint32_t lo = GAMMA1 - 1 - a->coeffs[2 * pair + 0];
        uint32_t hi = GAMMA1 - 1 - a->coeffs[2 * pair + 1];

        /* Map to {0,...,2*GAMMA1 - 2}: add Q when bit 31 of the difference
         * is set (the shifted value then acts as an all-ones mask). */
        lo += ((int32_t)lo >> 31) & Q;
        hi += ((int32_t)hi >> 31) & Q;

        out[0] = (unsigned char)lo;
        out[1] = (unsigned char)(lo >> 8);
        out[2] = (unsigned char)((lo >> 16) | (hi << 4));
        out[3] = (unsigned char)(hi >> 4);
        out[4] = (unsigned char)(hi >> 12);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyz_unpack
*
* Description: Unpack polynomial z with coefficients
*              in [-(GAMMA1 - 1), GAMMA1 - 1].
*              Output coefficients are standard representatives.
*              Five input bytes are expanded to two coefficients.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int pair;

    for (pair = 0; pair < N / 2; ++pair) {
        const unsigned char *in = &a[5 * pair];

        /* Reassemble the two 20-bit fields. */
        uint32_t lo = (uint32_t)in[0]
                      | ((uint32_t)in[1] << 8)
                      | ((uint32_t)(in[2] & 0x0F) << 16);
        uint32_t hi = (uint32_t)(in[2] >> 4)
                      | ((uint32_t)in[3] << 4)
                      | ((uint32_t)in[4] << 12);

        /* Undo the packing map; add Q when bit 31 of the difference is set. */
        lo = GAMMA1 - 1 - lo;
        lo += ((int32_t)lo >> 31) & Q;
        hi = GAMMA1 - 1 - hi;
        hi += ((int32_t)hi >> 31) & Q;

        r->coeffs[2 * pair + 0] = lo;
        r->coeffs[2 * pair + 1] = hi;
    }

}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyw1_pack
*
* Description: Bit-pack polynomial w1 with coefficients in [0, 15].
*              Input coefficients are assumed to be standard representatives.
*              Two coefficients are stored per byte, the even-indexed one
*              in the low nibble.
*
* Arguments:   - unsigned char *r: pointer to output byte array with at least
*                                  POLW1_SIZE_PACKED bytes
*              - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(unsigned char *restrict r, const poly *restrict a) {
    unsigned int k;

    for (k = 0; k < N / 2; ++k) {
        const uint32_t lo = a->coeffs[2 * k + 0];
        const uint32_t hi = a->coeffs[2 * k + 1];

        r[k] = (unsigned char)(lo | (hi << 4));
    }
}

+ 83
- 0
crypto_sign/dilithium3/avx2/poly.h View File

@@ -0,0 +1,83 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLY_H
#define PQCLEAN_DILITHIUM3_AVX2_POLY_H

#include <immintrin.h>
#include <stdint.h>

#include "alignment.h"
#include "params.h"

/* Polynomial with N coefficients. The union gives two views of the same
 * storage: N 32-bit coefficients, or N/8 256-bit vectors. */
typedef union {
    uint32_t coeffs[N];
    __m256i coeffs_x8[N / 8];
} poly;

/* Coefficient reduction. */
void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a);

/* Coefficient-wise arithmetic. */
void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b);
void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a);

/* NTT-domain operations. */
void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b);

/* Rounding and hints. */
void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a);
unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1);
void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h);

/* Norm check against bound B. */
int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B);

/* Rejection sampling from a seed and a 16-bit nonce. The _4x variants fill
 * four polynomials from the same seed with one nonce each. */
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a,
        const uint8_t *seed,
        uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const uint8_t *seed,
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a,
        const uint8_t *seed,
        uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const uint8_t *seed,
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1(poly *a,
        const uint8_t *seed,
        uint16_t nonce);
void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
        poly *a1,
        poly *a2,
        poly *a3,
        const uint8_t *seed,
        uint16_t nonce0,
        uint16_t nonce1,
        uint16_t nonce2,
        uint16_t nonce3);

/* Bit-packing of polynomials to and from byte arrays. */
void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t *r, const poly *a);
#endif

+ 353
- 0
crypto_sign/dilithium3/avx2/polyvec.c View File

@@ -0,0 +1,353 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze
*
* Description: Reduce coefficients of polynomials in vector of length L
*              to standard representatives, one polynomial at a time.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) {
    poly *cur = v->vec;
    poly *const stop = cur + L;

    for (; cur != stop; ++cur) {
        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_add
*
* Description: Add vectors of polynomials of length L, element by element.
*              No modular reduction is performed.
*
* Arguments:   - polyvecl *w: pointer to output vector
*              - const polyvecl *u: pointer to first summand
*              - const polyvecl *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
    unsigned int k = 0;

    while (k < L) {
        PQCLEAN_DILITHIUM3_AVX2_poly_add(w->vec + k, u->vec + k, v->vec + k);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L. Output
*              coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments:   - polyvecl *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v) {
    poly *cur = v->vec;
    poly *const stop = cur + L;

    while (cur != stop) {
        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
*              resulting vector by 2^{-32} and add (accumulate) polynomials
*              in it. Input/output vectors are in NTT domain representation.
*              Input coefficients are assumed to be less than 22*Q. Output
*              coefficients are less than 2*L*Q.
*
*              The work is delegated to the assembly routine, which is
*              handed the coefficient array of the first polynomial of
*              each input vector.
*
* Arguments:   - poly *w: output polynomial
*              - const polyvecl *u: pointer to first input vector
*              - const polyvecl *v: pointer to second input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
        const polyvecl *u,
        const polyvecl *v) {
    PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->coeffs,
            u->vec[0].coeffs,
            v->vec[0].coeffs);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
*              Assumes input coefficients to be standard representatives.
*              Stops at the first polynomial that fails the check.
*
* Arguments:   - const polyvecl *v: pointer to vector
*              - uint32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than bound and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
    unsigned int k = 0;

    /* Advance while the polynomials pass the check. */
    while (k < L && !PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[k], bound)) {
        ++k;
    }

    /* k stopped short of L exactly when some polynomial failed. */
    return k < L;
}

/**************************************************************/
/************ Vectors of polynomials of length K **************/
/**************************************************************/


/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
*              to representatives in [0,2*Q[.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v) {
    poly *cur = v->vec;
    poly *const stop = cur + K;

    for (; cur != stop; ++cur) {
        PQCLEAN_DILITHIUM3_AVX2_poly_reduce(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq
*
* Description: For all coefficients of polynomials in vector of length K
*              subtract Q if coefficient is bigger than Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_csubq(v->vec + k);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze
*
* Description: Reduce coefficients of polynomials in vector of length K
*              to standard representatives, one polynomial at a time.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) {
    poly *cur = v->vec;
    poly *const stop = cur + K;

    while (cur != stop) {
        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_add
*
* Description: Add vectors of polynomials of length K, element by element.
*              No modular reduction is performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first summand
*              - const polyveck *v: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_add(w->vec + k, u->vec + k, v->vec + k);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K, element by
*              element. Assumes coefficients of polynomials in second
*              input vector to be less than 2*Q. No modular reduction is
*              performed.
*
* Arguments:   - polyveck *w: pointer to output vector
*              - const polyveck *u: pointer to first input vector
*              - const polyveck *v: pointer to second input vector to be
*                                   subtracted from first input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
    unsigned int k;

    for (k = 0; k != K; k++) {
        poly *diff = w->vec + k;
        const poly *minuend = u->vec + k;
        const poly *subtrahend = v->vec + k;

        PQCLEAN_DILITHIUM3_AVX2_poly_sub(diff, minuend, subtrahend);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl
*
* Description: Multiply vector of polynomials of length K by 2^D without
*              modular reduction. Assumes input coefficients to be less
*              than 2^{32-D}.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) {
    poly *cur = v->vec;
    poly *const stop = cur + K;

    for (; cur != stop; ++cur) {
        PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(cur);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K. Output
*              coefficients can be up to 16*Q larger than input coefficients.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(v->vec + k);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
*              in vector of length K. Input coefficients need to be less
*              than 2*Q.
*
* Arguments:   - polyveck *v: pointer to input/output vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v) {
    poly *cur = v->vec;
    poly *const stop = cur + K;

    while (cur != stop) {
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(cur);
        ++cur;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
*              Assumes input coefficients to be standard representatives.
*              Stops at the first polynomial that fails the check.
*
* Arguments:   - const polyveck *v: pointer to vector
*              - uint32_t bound: norm bound
*
* Returns 0 if norm of all polynomials is strictly smaller than bound and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) {
    unsigned int k = 0;

    /* Advance while the polynomials pass the check. */
    while (k < K && !PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[k], bound)) {
        ++k;
    }

    /* k stopped short of K exactly when some polynomial failed. */
    return k < K;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute a0, a1 such that a mod Q = a1*2^D + a0
*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
*              standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients Q + a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int k = 0;

    while (k < K) {
        PQCLEAN_DILITHIUM3_AVX2_poly_power2round(v1->vec + k, v0->vec + k, v->vec + k);
        ++k;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
*              compute high and low bits a0, a1 such that
*              a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
*              except when a1 = (Q-1)/ALPHA, where a1 is set to 0 and
*              -ALPHA/2 <= a0 = a mod Q - Q < 0.
*              Assumes coefficients to be standard representatives.
*
* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
*                              coefficients a1
*              - polyveck *v0: pointer to output vector of polynomials with
*                              coefficients Q + a0
*              - const polyveck *v: pointer to input vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
    unsigned int k;

    for (k = 0; k != K; k++) {
        poly *high = v1->vec + k;
        poly *low = v0->vec + k;

        PQCLEAN_DILITHIUM3_AVX2_poly_decompose(high, low, v->vec + k);
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint
*
* Description: Computes the hint vector by calling
*              PQCLEAN_DILITHIUM3_AVX2_poly_make_hint on each of the K
*              polynomial triples (h[k], v0[k], v1[k]) and accumulating the
*              per-polynomial return values.
*
* Arguments:   - polyveck *h: output hint vector
*              - const polyveck *v0: low part of the input vector
*              - const polyveck *v1: high part of the input vector
*
* Returns the sum of the per-polynomial results (number of 1 bits in h).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(polyveck *h,
        const polyveck *v0,
        const polyveck *v1) {
    unsigned int ones = 0;
    unsigned int k = 0;

    while (k < K) {
        ones += PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(&h->vec[k], &v0->vec[k], &v1->vec[k]);
        ++k;
    }

    return ones;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint
*
* Description: Uses the hint vector to correct the high bits of the input
*              vector, by calling PQCLEAN_DILITHIUM3_AVX2_poly_use_hint on
*              each of the K polynomials.
*
* Arguments:   - polyveck *w: output vector of polynomials with corrected
*                             high bits
*              - const polyveck *v: input vector
*              - const polyveck *h: input hint vector
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
    poly *out = w->vec;
    const poly *in = v->vec;
    const poly *hint = h->vec;
    const poly *const in_end = v->vec + K;

    for (; in != in_end; ++in, ++hint, ++out) {
        PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(out, in, hint);
    }
}

+ 52
- 0
crypto_sign/dilithium3/avx2/polyvec.h View File

@@ -0,0 +1,52 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H
#define PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H

/* Vector-of-polynomial types (lengths L and K from params.h) and the
 * operations defined on them. */

#include <stdint.h>

#include "params.h"
#include "poly.h"

/* Vectors of polynomials of length L */
typedef struct {
poly vec[L];
} polyvecl;

/* NOTE(review): the polyvecl functions below are declared here but their
 * definitions are not part of this view; presumably each applies the
 * same-named poly_* operation to every element -- confirm in polyvec.c. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v);

/* w = u + v (w may alias u; sign.c calls it as add(&z, &z, &y)). */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);

void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v);
/* Writes a single polynomial w computed from the two length-L vectors u, v. */
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
const polyvecl *u,
const polyvecl *v);

/* Norm check against bound B; sign.c treats a non-zero return as "reject". */
int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B);



/* Vectors of polynomials of length K */
typedef struct {
poly vec[K];
} polyveck;

void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v);

/* w = u + v and w = u - v; sign.c calls both with w aliasing u. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v);

void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v);

/* Returns 1 as soon as poly_chknorm reports a violation for any of the K
 * polynomials, 0 otherwise. */
int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t B);

/* Element-wise wrappers: each calls the same-named poly_* routine on all K
 * polynomials. v1 receives the high parts, v0 the low parts (stored as
 * Q + a0). make_hint returns the total number of hint bits set. */
void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);

#endif

+ 9
- 0
crypto_sign/dilithium3/avx2/reduce.h View File

@@ -0,0 +1,9 @@
/* Prototypes for the AVX2 assembly reduction routines (see reduce.s).
 *
 * NOTE(review): the prototypes use N, which this header does not define;
 * presumably every includer pulls in params.h first -- confirm, or include
 * it here so the header is self-contained.
 * NOTE(review): the include guard is not namespaced like the one in
 * polyvec.h (PQCLEAN_DILITHIUM3_AVX2_...). */
#ifndef REDUCE_H
#define REDUCE_H

#include <stdint.h>

/* Both routines work in place on a. The assembly processes 8 * 128 bytes
 * (256 32-bit words) with aligned 32-byte loads/stores, so a must be
 * 32-byte aligned. */
void PQCLEAN_DILITHIUM3_AVX2_reduce_avx(uint32_t a[N]);
void PQCLEAN_DILITHIUM3_AVX2_csubq_avx(uint32_t a[N]);

#endif

+ 91
- 0
crypto_sign/dilithium3/avx2/reduce.s View File

@@ -0,0 +1,91 @@
# void PQCLEAN_DILITHIUM3_AVX2_reduce_avx(uint32_t a[N])
#
# In-place partial reduction of an array of 32-bit words.
#   %rdi : pointer to the array (advanced by 1024 bytes on return)
# The loop runs 8 times over 128 bytes each, i.e. 256 32-bit lanes in total.
# All memory accesses use vmovdqa, so the array must be 32-byte aligned.
# Clobbers %eax and %ymm0-%ymm8.
#
# Per 32-bit lane x, with mask = the constant loaded from 8x23ones:
#   hi = x >> 23 (logical);  lo = x & mask
#   x' = lo - hi + (hi << 13) = lo + hi * (2^13 - 1)
# NOTE(review): this is congruent to x mod Q provided mask == 2^23 - 1 and
# Q == 2^23 - 2^13 + 1; both constants are defined outside this file -- confirm.
.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx
PQCLEAN_DILITHIUM3_AVX2_reduce_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0

# %eax = iteration counter
xor %eax,%eax
_looptop_rdc32:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#reduce
# hi = x >> 23 (kept in the even-numbered partner register of each vector)
vpsrld $23,%ymm1,%ymm2
vpsrld $23,%ymm3,%ymm4
vpsrld $23,%ymm5,%ymm6
vpsrld $23,%ymm7,%ymm8
# lo = x & mask
vpand %ymm0,%ymm1,%ymm1
vpand %ymm0,%ymm3,%ymm3
vpand %ymm0,%ymm5,%ymm5
vpand %ymm0,%ymm7,%ymm7
# lo -= hi
vpsubd %ymm2,%ymm1,%ymm1
vpsubd %ymm4,%ymm3,%ymm3
vpsubd %ymm6,%ymm5,%ymm5
vpsubd %ymm8,%ymm7,%ymm7
# hi <<= 13
vpslld $13,%ymm2,%ymm2
vpslld $13,%ymm4,%ymm4
vpslld $13,%ymm6,%ymm6
vpslld $13,%ymm8,%ymm8
# result = (lo - hi) + (hi << 13)
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance to the next 128-byte chunk; 8 chunks in total
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_rdc32

ret

# void PQCLEAN_DILITHIUM3_AVX2_csubq_avx(uint32_t a[N])
#
# In-place conditional subtraction of the constant q loaded from 8xq.
#   %rdi : pointer to the array (advanced by 1024 bytes on return)
# The loop runs 8 times over 128 bytes each, i.e. 256 32-bit lanes in total.
# All memory accesses use vmovdqa, so the array must be 32-byte aligned.
# Clobbers %eax and %ymm0-%ymm8.
#
# Per 32-bit lane x:
#   t    = x - q
#   sign = t >> 31 (arithmetic: all ones if t is negative as a signed value)
#   x'   = t + (sign & q)      i.e. x - q if that is non-negative, else x
# The selection is done with a mask, without branching on the data.
# NOTE(review): 8xq is defined outside this file; presumably it holds Q in
# every lane -- confirm.
.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx
PQCLEAN_DILITHIUM3_AVX2_csubq_avx:
#consts
vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0

# %eax = iteration counter
xor %eax,%eax
_looptop_csubq:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#PQCLEAN_DILITHIUM3_AVX2_csubq
# t = x - q
vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5
vpsubd %ymm0,%ymm7,%ymm7
# sign mask of t
vpsrad $31,%ymm1,%ymm2
vpsrad $31,%ymm3,%ymm4
vpsrad $31,%ymm5,%ymm6
vpsrad $31,%ymm7,%ymm8
# q where t was negative, 0 elsewhere
vpand %ymm0,%ymm2,%ymm2
vpand %ymm0,%ymm4,%ymm4
vpand %ymm0,%ymm6,%ymm6
vpand %ymm0,%ymm8,%ymm8
# add q back where the subtraction went negative
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance to the next 128-byte chunk; 8 chunks in total
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_csubq

ret

+ 443
- 0
crypto_sign/dilithium3/avx2/rejsample.c View File

@@ -0,0 +1,443 @@
#include <immintrin.h>
#include <stdint.h>

#include "params.h"
#include "rejsample.h"

/*
 * Lane-compaction lookup table, indexed by an 8-bit acceptance mask.
 * Row m lists the positions of the set bits of m in ascending order, padded
 * with zeros up to 8 entries (e.g. row 5 = 0b00000101 -> {0, 2, 0, ...}).
 * The rej_* routines in this file load a row as permute/shuffle indices so
 * that the lanes selected by the mask are gathered at the front of a vector.
 */
static const uint8_t idx[256][8] = {
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 0, 0, 0, 0, 0, 0, 0, 0},
{ 1, 0, 0, 0, 0, 0, 0, 0},
{ 0, 1, 0, 0, 0, 0, 0, 0},
{ 2, 0, 0, 0, 0, 0, 0, 0},
{ 0, 2, 0, 0, 0, 0, 0, 0},
{ 1, 2, 0, 0, 0, 0, 0, 0},
{ 0, 1, 2, 0, 0, 0, 0, 0},
{ 3, 0, 0, 0, 0, 0, 0, 0},
{ 0, 3, 0, 0, 0, 0, 0, 0},
{ 1, 3, 0, 0, 0, 0, 0, 0},
{ 0, 1, 3, 0, 0, 0, 0, 0},
{ 2, 3, 0, 0, 0, 0, 0, 0},
{ 0, 2, 3, 0, 0, 0, 0, 0},
{ 1, 2, 3, 0, 0, 0, 0, 0},
{ 0, 1, 2, 3, 0, 0, 0, 0},
{ 4, 0, 0, 0, 0, 0, 0, 0},
{ 0, 4, 0, 0, 0, 0, 0, 0},
{ 1, 4, 0, 0, 0, 0, 0, 0},
{ 0, 1, 4, 0, 0, 0, 0, 0},
{ 2, 4, 0, 0, 0, 0, 0, 0},
{ 0, 2, 4, 0, 0, 0, 0, 0},
{ 1, 2, 4, 0, 0, 0, 0, 0},
{ 0, 1, 2, 4, 0, 0, 0, 0},
{ 3, 4, 0, 0, 0, 0, 0, 0},
{ 0, 3, 4, 0, 0, 0, 0, 0},
{ 1, 3, 4, 0, 0, 0, 0, 0},
{ 0, 1, 3, 4, 0, 0, 0, 0},
{ 2, 3, 4, 0, 0, 0, 0, 0},
{ 0, 2, 3, 4, 0, 0, 0, 0},
{ 1, 2, 3, 4, 0, 0, 0, 0},
{ 0, 1, 2, 3, 4, 0, 0, 0},
{ 5, 0, 0, 0, 0, 0, 0, 0},
{ 0, 5, 0, 0, 0, 0, 0, 0},
{ 1, 5, 0, 0, 0, 0, 0, 0},
{ 0, 1, 5, 0, 0, 0, 0, 0},
{ 2, 5, 0, 0, 0, 0, 0, 0},
{ 0, 2, 5, 0, 0, 0, 0, 0},
{ 1, 2, 5, 0, 0, 0, 0, 0},
{ 0, 1, 2, 5, 0, 0, 0, 0},
{ 3, 5, 0, 0, 0, 0, 0, 0},
{ 0, 3, 5, 0, 0, 0, 0, 0},
{ 1, 3, 5, 0, 0, 0, 0, 0},
{ 0, 1, 3, 5, 0, 0, 0, 0},
{ 2, 3, 5, 0, 0, 0, 0, 0},
{ 0, 2, 3, 5, 0, 0, 0, 0},
{ 1, 2, 3, 5, 0, 0, 0, 0},
{ 0, 1, 2, 3, 5, 0, 0, 0},
{ 4, 5, 0, 0, 0, 0, 0, 0},
{ 0, 4, 5, 0, 0, 0, 0, 0},
{ 1, 4, 5, 0, 0, 0, 0, 0},
{ 0, 1, 4, 5, 0, 0, 0, 0},
{ 2, 4, 5, 0, 0, 0, 0, 0},
{ 0, 2, 4, 5, 0, 0, 0, 0},
{ 1, 2, 4, 5, 0, 0, 0, 0},
{ 0, 1, 2, 4, 5, 0, 0, 0},
{ 3, 4, 5, 0, 0, 0, 0, 0},
{ 0, 3, 4, 5, 0, 0, 0, 0},
{ 1, 3, 4, 5, 0, 0, 0, 0},
{ 0, 1, 3, 4, 5, 0, 0, 0},
{ 2, 3, 4, 5, 0, 0, 0, 0},
{ 0, 2, 3, 4, 5, 0, 0, 0},
{ 1, 2, 3, 4, 5, 0, 0, 0},
{ 0, 1, 2, 3, 4, 5, 0, 0},
{ 6, 0, 0, 0, 0, 0, 0, 0},
{ 0, 6, 0, 0, 0, 0, 0, 0},
{ 1, 6, 0, 0, 0, 0, 0, 0},
{ 0, 1, 6, 0, 0, 0, 0, 0},
{ 2, 6, 0, 0, 0, 0, 0, 0},
{ 0, 2, 6, 0, 0, 0, 0, 0},
{ 1, 2, 6, 0, 0, 0, 0, 0},
{ 0, 1, 2, 6, 0, 0, 0, 0},
{ 3, 6, 0, 0, 0, 0, 0, 0},
{ 0, 3, 6, 0, 0, 0, 0, 0},
{ 1, 3, 6, 0, 0, 0, 0, 0},
{ 0, 1, 3, 6, 0, 0, 0, 0},
{ 2, 3, 6, 0, 0, 0, 0, 0},
{ 0, 2, 3, 6, 0, 0, 0, 0},
{ 1, 2, 3, 6, 0, 0, 0, 0},
{ 0, 1, 2, 3, 6, 0, 0, 0},
{ 4, 6, 0, 0, 0, 0, 0, 0},
{ 0, 4, 6, 0, 0, 0, 0, 0},
{ 1, 4, 6, 0, 0, 0, 0, 0},
{ 0, 1, 4, 6, 0, 0, 0, 0},
{ 2, 4, 6, 0, 0, 0, 0, 0},
{ 0, 2, 4, 6, 0, 0, 0, 0},
{ 1, 2, 4, 6, 0, 0, 0, 0},
{ 0, 1, 2, 4, 6, 0, 0, 0},
{ 3, 4, 6, 0, 0, 0, 0, 0},
{ 0, 3, 4, 6, 0, 0, 0, 0},
{ 1, 3, 4, 6, 0, 0, 0, 0},
{ 0, 1, 3, 4, 6, 0, 0, 0},
{ 2, 3, 4, 6, 0, 0, 0, 0},
{ 0, 2, 3, 4, 6, 0, 0, 0},
{ 1, 2, 3, 4, 6, 0, 0, 0},
{ 0, 1, 2, 3, 4, 6, 0, 0},
{ 5, 6, 0, 0, 0, 0, 0, 0},
{ 0, 5, 6, 0, 0, 0, 0, 0},
{ 1, 5, 6, 0, 0, 0, 0, 0},
{ 0, 1, 5, 6, 0, 0, 0, 0},
{ 2, 5, 6, 0, 0, 0, 0, 0},
{ 0, 2, 5, 6, 0, 0, 0, 0},
{ 1, 2, 5, 6, 0, 0, 0, 0},
{ 0, 1, 2, 5, 6, 0, 0, 0},
{ 3, 5, 6, 0, 0, 0, 0, 0},
{ 0, 3, 5, 6, 0, 0, 0, 0},
{ 1, 3, 5, 6, 0, 0, 0, 0},
{ 0, 1, 3, 5, 6, 0, 0, 0},
{ 2, 3, 5, 6, 0, 0, 0, 0},
{ 0, 2, 3, 5, 6, 0, 0, 0},
{ 1, 2, 3, 5, 6, 0, 0, 0},
{ 0, 1, 2, 3, 5, 6, 0, 0},
{ 4, 5, 6, 0, 0, 0, 0, 0},
{ 0, 4, 5, 6, 0, 0, 0, 0},
{ 1, 4, 5, 6, 0, 0, 0, 0},
{ 0, 1, 4, 5, 6, 0, 0, 0},
{ 2, 4, 5, 6, 0, 0, 0, 0},
{ 0, 2, 4, 5, 6, 0, 0, 0},
{ 1, 2, 4, 5, 6, 0, 0, 0},
{ 0, 1, 2, 4, 5, 6, 0, 0},
{ 3, 4, 5, 6, 0, 0, 0, 0},
{ 0, 3, 4, 5, 6, 0, 0, 0},
{ 1, 3, 4, 5, 6, 0, 0, 0},
{ 0, 1, 3, 4, 5, 6, 0, 0},
{ 2, 3, 4, 5, 6, 0, 0, 0},
{ 0, 2, 3, 4, 5, 6, 0, 0},
{ 1, 2, 3, 4, 5, 6, 0, 0},
{ 0, 1, 2, 3, 4, 5, 6, 0},
{ 7, 0, 0, 0, 0, 0, 0, 0},
{ 0, 7, 0, 0, 0, 0, 0, 0},
{ 1, 7, 0, 0, 0, 0, 0, 0},
{ 0, 1, 7, 0, 0, 0, 0, 0},
{ 2, 7, 0, 0, 0, 0, 0, 0},
{ 0, 2, 7, 0, 0, 0, 0, 0},
{ 1, 2, 7, 0, 0, 0, 0, 0},
{ 0, 1, 2, 7, 0, 0, 0, 0},
{ 3, 7, 0, 0, 0, 0, 0, 0},
{ 0, 3, 7, 0, 0, 0, 0, 0},
{ 1, 3, 7, 0, 0, 0, 0, 0},
{ 0, 1, 3, 7, 0, 0, 0, 0},
{ 2, 3, 7, 0, 0, 0, 0, 0},
{ 0, 2, 3, 7, 0, 0, 0, 0},
{ 1, 2, 3, 7, 0, 0, 0, 0},
{ 0, 1, 2, 3, 7, 0, 0, 0},
{ 4, 7, 0, 0, 0, 0, 0, 0},
{ 0, 4, 7, 0, 0, 0, 0, 0},
{ 1, 4, 7, 0, 0, 0, 0, 0},
{ 0, 1, 4, 7, 0, 0, 0, 0},
{ 2, 4, 7, 0, 0, 0, 0, 0},
{ 0, 2, 4, 7, 0, 0, 0, 0},
{ 1, 2, 4, 7, 0, 0, 0, 0},
{ 0, 1, 2, 4, 7, 0, 0, 0},
{ 3, 4, 7, 0, 0, 0, 0, 0},
{ 0, 3, 4, 7, 0, 0, 0, 0},
{ 1, 3, 4, 7, 0, 0, 0, 0},
{ 0, 1, 3, 4, 7, 0, 0, 0},
{ 2, 3, 4, 7, 0, 0, 0, 0},
{ 0, 2, 3, 4, 7, 0, 0, 0},
{ 1, 2, 3, 4, 7, 0, 0, 0},
{ 0, 1, 2, 3, 4, 7, 0, 0},
{ 5, 7, 0, 0, 0, 0, 0, 0},
{ 0, 5, 7, 0, 0, 0, 0, 0},
{ 1, 5, 7, 0, 0, 0, 0, 0},
{ 0, 1, 5, 7, 0, 0, 0, 0},
{ 2, 5, 7, 0, 0, 0, 0, 0},
{ 0, 2, 5, 7, 0, 0, 0, 0},
{ 1, 2, 5, 7, 0, 0, 0, 0},
{ 0, 1, 2, 5, 7, 0, 0, 0},
{ 3, 5, 7, 0, 0, 0, 0, 0},
{ 0, 3, 5, 7, 0, 0, 0, 0},
{ 1, 3, 5, 7, 0, 0, 0, 0},
{ 0, 1, 3, 5, 7, 0, 0, 0},
{ 2, 3, 5, 7, 0, 0, 0, 0},
{ 0, 2, 3, 5, 7, 0, 0, 0},
{ 1, 2, 3, 5, 7, 0, 0, 0},
{ 0, 1, 2, 3, 5, 7, 0, 0},
{ 4, 5, 7, 0, 0, 0, 0, 0},
{ 0, 4, 5, 7, 0, 0, 0, 0},
{ 1, 4, 5, 7, 0, 0, 0, 0},
{ 0, 1, 4, 5, 7, 0, 0, 0},
{ 2, 4, 5, 7, 0, 0, 0, 0},
{ 0, 2, 4, 5, 7, 0, 0, 0},
{ 1, 2, 4, 5, 7, 0, 0, 0},
{ 0, 1, 2, 4, 5, 7, 0, 0},
{ 3, 4, 5, 7, 0, 0, 0, 0},
{ 0, 3, 4, 5, 7, 0, 0, 0},
{ 1, 3, 4, 5, 7, 0, 0, 0},
{ 0, 1, 3, 4, 5, 7, 0, 0},
{ 2, 3, 4, 5, 7, 0, 0, 0},
{ 0, 2, 3, 4, 5, 7, 0, 0},
{ 1, 2, 3, 4, 5, 7, 0, 0},
{ 0, 1, 2, 3, 4, 5, 7, 0},
{ 6, 7, 0, 0, 0, 0, 0, 0},
{ 0, 6, 7, 0, 0, 0, 0, 0},
{ 1, 6, 7, 0, 0, 0, 0, 0},
{ 0, 1, 6, 7, 0, 0, 0, 0},
{ 2, 6, 7, 0, 0, 0, 0, 0},
{ 0, 2, 6, 7, 0, 0, 0, 0},
{ 1, 2, 6, 7, 0, 0, 0, 0},
{ 0, 1, 2, 6, 7, 0, 0, 0},
{ 3, 6, 7, 0, 0, 0, 0, 0},
{ 0, 3, 6, 7, 0, 0, 0, 0},
{ 1, 3, 6, 7, 0, 0, 0, 0},
{ 0, 1, 3, 6, 7, 0, 0, 0},
{ 2, 3, 6, 7, 0, 0, 0, 0},
{ 0, 2, 3, 6, 7, 0, 0, 0},
{ 1, 2, 3, 6, 7, 0, 0, 0},
{ 0, 1, 2, 3, 6, 7, 0, 0},
{ 4, 6, 7, 0, 0, 0, 0, 0},
{ 0, 4, 6, 7, 0, 0, 0, 0},
{ 1, 4, 6, 7, 0, 0, 0, 0},
{ 0, 1, 4, 6, 7, 0, 0, 0},
{ 2, 4, 6, 7, 0, 0, 0, 0},
{ 0, 2, 4, 6, 7, 0, 0, 0},
{ 1, 2, 4, 6, 7, 0, 0, 0},
{ 0, 1, 2, 4, 6, 7, 0, 0},
{ 3, 4, 6, 7, 0, 0, 0, 0},
{ 0, 3, 4, 6, 7, 0, 0, 0},
{ 1, 3, 4, 6, 7, 0, 0, 0},
{ 0, 1, 3, 4, 6, 7, 0, 0},
{ 2, 3, 4, 6, 7, 0, 0, 0},
{ 0, 2, 3, 4, 6, 7, 0, 0},
{ 1, 2, 3, 4, 6, 7, 0, 0},
{ 0, 1, 2, 3, 4, 6, 7, 0},
{ 5, 6, 7, 0, 0, 0, 0, 0},
{ 0, 5, 6, 7, 0, 0, 0, 0},
{ 1, 5, 6, 7, 0, 0, 0, 0},
{ 0, 1, 5, 6, 7, 0, 0, 0},
{ 2, 5, 6, 7, 0, 0, 0, 0},
{ 0, 2, 5, 6, 7, 0, 0, 0},
{ 1, 2, 5, 6, 7, 0, 0, 0},
{ 0, 1, 2, 5, 6, 7, 0, 0},
{ 3, 5, 6, 7, 0, 0, 0, 0},
{ 0, 3, 5, 6, 7, 0, 0, 0},
{ 1, 3, 5, 6, 7, 0, 0, 0},
{ 0, 1, 3, 5, 6, 7, 0, 0},
{ 2, 3, 5, 6, 7, 0, 0, 0},
{ 0, 2, 3, 5, 6, 7, 0, 0},
{ 1, 2, 3, 5, 6, 7, 0, 0},
{ 0, 1, 2, 3, 5, 6, 7, 0},
{ 4, 5, 6, 7, 0, 0, 0, 0},
{ 0, 4, 5, 6, 7, 0, 0, 0},
{ 1, 4, 5, 6, 7, 0, 0, 0},
{ 0, 1, 4, 5, 6, 7, 0, 0},
{ 2, 4, 5, 6, 7, 0, 0, 0},
{ 0, 2, 4, 5, 6, 7, 0, 0},
{ 1, 2, 4, 5, 6, 7, 0, 0},
{ 0, 1, 2, 4, 5, 6, 7, 0},
{ 3, 4, 5, 6, 7, 0, 0, 0},
{ 0, 3, 4, 5, 6, 7, 0, 0},
{ 1, 3, 4, 5, 6, 7, 0, 0},
{ 0, 1, 3, 4, 5, 6, 7, 0},
{ 2, 3, 4, 5, 6, 7, 0, 0},
{ 0, 2, 3, 4, 5, 6, 7, 0},
{ 1, 2, 3, 4, 5, 6, 7, 0},
{ 0, 1, 2, 3, 4, 5, 6, 7}
};

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_rej_uniform
*
* Description: Rejection sampling of values below Q from a byte stream.
*              Every 3 input bytes are read little-endian, masked to 23
*              bits, and kept only if the result is smaller than Q.
*              Accepted values are written to r in stream order. The main
*              loop handles 8 candidates (24 bytes) at a time with AVX2, a
*              scalar loop consumes what is left.
*
*              The vector path always stores 8 words at r[ctr], so entries
*              of r at or beyond the returned count (but below len) may
*              have been overwritten with unspecified values.
*
* Arguments:   - uint32_t *r: output array with room for len values
*              - unsigned int len: maximum number of values to produce
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes available in buf
*
* Returns the number of values written to r (at most len).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint32_t vec[8];
    __m256i d, tmp;
    __m128i rid;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi32(Q);

    ctr = pos = 0;
    while (ctr + 8 <= len && pos + 24 <= buflen) {
        for (i = 0; i < 8; i++) {
            vec[i] = buf[pos++];
            vec[i] |= (uint32_t)buf[pos++] << 8;
            vec[i] |= (uint32_t)buf[pos++] << 16;
            vec[i] &= 0x7FFFFF;
        }

        /* One mask bit per lane: set where the candidate is < Q.
         * The int->float reinterpretation uses the documented cast
         * intrinsic rather than a GNU vector C-cast. */
        d = _mm256_loadu_si256((const __m256i *)vec);
        tmp = _mm256_cmpgt_epi32(bound, d);
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));

        /* Gather the accepted lanes at the front, then store all 8 lanes;
         * only the first popcount(good) of them count as output. */
        rid = _mm_loadl_epi64((const __m128i *)idx[good]);
        tmp = _mm256_cvtepu8_epi32(rid);
        d = _mm256_permutevar8x32_epi32(d, tmp);
        _mm256_storeu_si256((__m256i *)&r[ctr], d);
        ctr += (unsigned int)__builtin_popcount(good);
    }

    /* Scalar tail: fewer than 8 output slots or fewer than 24 bytes left. */
    while (ctr < len && pos + 3 <= buflen) {
        vec[0] = buf[pos++];
        vec[0] |= (uint32_t)buf[pos++] << 8;
        vec[0] |= (uint32_t)buf[pos++] << 16;
        vec[0] &= 0x7FFFFF;

        if (vec[0] < Q) {
            r[ctr++] = vec[0];
        }
    }

    return ctr;
}

/*
 * Helper for rej_eta: compacts the bytes of `nibbles` selected by the low
 * 8 bits of `mask` (via the idx table), widens them to 32 bits, maps each
 * value t to off - t and stores all 8 resulting words at r.
 * Returns the number of selected bytes, i.e. how many of the stored words
 * are valid output.
 */
static unsigned int rej_eta_compact8(uint32_t *r, __m128i nibbles, uint32_t mask, __m256i off) {
    const __m128i rid = _mm_loadl_epi64((const __m128i *)idx[mask]);
    const __m128i packed = _mm_shuffle_epi8(nibbles, rid);
    const __m256i coeffs = _mm256_sub_epi32(off, _mm256_cvtepu8_epi32(packed));

    _mm256_storeu_si256((__m256i *)r, coeffs);
    return (unsigned int)__builtin_popcount(mask);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_rej_eta
*
* Description: Rejection sampling of small coefficients from a byte stream.
*              Every input byte yields two 4-bit candidates (low nibble
*              first). A candidate t is kept if t <= 2*ETA and is written
*              as Q + ETA - t. The main loop handles 32 candidates
*              (16 bytes) at a time with AVX2, a scalar loop consumes what
*              is left.
*
*              The vector path always stores 8 words per group of 8
*              candidates, so entries of r at or beyond the returned count
*              (but below len) may have been overwritten with unspecified
*              values.
*
* Arguments:   - uint32_t *r: output array with room for len values
*              - unsigned int len: maximum number of values to produce
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes available in buf
*
* Returns the number of values written to r (at most len).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint8_t vec[32];
    __m256i cand, accept;
    __m128i lo, hi;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi8(2 * ETA + 1);
    const __m256i off = _mm256_set1_epi32(Q + ETA);

    ctr = pos = 0;
    while (ctr + 32 <= len && pos + 16 <= buflen) {
        for (i = 0; i < 16; i++) {
            vec[2 * i + 0] = buf[pos] & 0x0F;
            vec[2 * i + 1] = buf[pos++] >> 4;
        }

        /* One mask bit per candidate byte: set where candidate < 2*ETA+1. */
        cand = _mm256_loadu_si256((const __m256i *)vec);
        accept = _mm256_cmpgt_epi8(bound, cand);
        good = (uint32_t)_mm256_movemask_epi8(accept);

        /* Process the 32 candidates as four groups of 8 bytes, in stream
         * order; each group advances ctr by its number of accepted values. */
        lo = _mm256_castsi256_si128(cand);
        hi = _mm256_extracti128_si256(cand, 1);

        ctr += rej_eta_compact8(&r[ctr], lo, good & 0xFF, off);
        ctr += rej_eta_compact8(&r[ctr], _mm_bsrli_si128(lo, 8), (good >> 8) & 0xFF, off);
        ctr += rej_eta_compact8(&r[ctr], hi, (good >> 16) & 0xFF, off);
        ctr += rej_eta_compact8(&r[ctr], _mm_bsrli_si128(hi, 8), (good >> 24) & 0xFF, off);
    }

    /* Scalar tail; the second nibble is dropped if r is already full. */
    while (ctr < len && pos < buflen) {
        vec[0] = buf[pos] & 0x0F;
        vec[1] = buf[pos++] >> 4;

        if (vec[0] <= 2 * ETA) {
            r[ctr++] = Q + ETA - vec[0];
        }
        if (vec[1] <= 2 * ETA && ctr < len) {
            r[ctr++] = Q + ETA - vec[1];
        }
    }

    return ctr;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1
*
* Description: Rejection sampling of coefficients from a byte stream.
*              Every 5 input bytes yield two 20-bit candidates: the low 20
*              bits of bytes 0..2, and the 20 bits formed by the high
*              nibble of byte 2 followed by bytes 3 and 4. A candidate t is
*              kept if t <= 2*GAMMA1 - 2 and is written as
*              Q + GAMMA1 - 1 - t. The main loop handles 8 candidates
*              (20 bytes) at a time with AVX2, a scalar loop consumes what
*              is left.
*
*              The vector path always stores 8 words at r[ctr], so entries
*              of r at or beyond the returned count (but below len) may
*              have been overwritten with unspecified values.
*
* Arguments:   - uint32_t *r: output array with room for len values
*              - unsigned int len: maximum number of values to produce
*              - const uint8_t *buf: input byte stream
*              - unsigned int buflen: number of bytes available in buf
*
* Returns the number of values written to r (at most len).
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(
    uint32_t *r,
    unsigned int len,
    const uint8_t *buf,
    unsigned int buflen) {
    unsigned int i, ctr, pos;
    uint32_t vec[8];
    __m256i d, tmp;
    __m128i rid;
    uint32_t good;
    const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1);
    const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1);

    ctr = pos = 0;
    while (ctr + 8 <= len && pos + 20 <= buflen) {
        for (i = 0; i < 4; i++) {
            vec[2 * i + 0] = buf[pos + 0];
            vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8;
            vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16;
            vec[2 * i + 0] &= 0xFFFFF;

            vec[2 * i + 1] = buf[pos + 2] >> 4;
            vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4;
            vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12;

            pos += 5;
        }

        /* One mask bit per lane: set where candidate < 2*GAMMA1 - 1.
         * The int->float reinterpretation uses the documented cast
         * intrinsic rather than a GNU vector C-cast. */
        d = _mm256_loadu_si256((const __m256i *)vec);
        tmp = _mm256_cmpgt_epi32(bound, d);
        good = (uint32_t)_mm256_movemask_ps(_mm256_castsi256_ps(tmp));
        d = _mm256_sub_epi32(off, d);

        /* Gather the accepted lanes at the front, then store all 8 lanes;
         * only the first popcount(good) of them count as output. */
        rid = _mm_loadl_epi64((const __m128i *)idx[good]);
        tmp = _mm256_cvtepu8_epi32(rid);
        d = _mm256_permutevar8x32_epi32(d, tmp);
        _mm256_storeu_si256((__m256i *)&r[ctr], d);
        ctr += (unsigned int)__builtin_popcount(good);
    }

    /* Scalar tail; the second candidate is dropped if r is already full. */
    while (ctr < len && pos + 5 <= buflen) {
        vec[0] = buf[pos + 0];
        vec[0] |= (uint32_t)buf[pos + 1] << 8;
        vec[0] |= (uint32_t)buf[pos + 2] << 16;
        vec[0] &= 0xFFFFF;

        vec[1] = buf[pos + 2] >> 4;
        vec[1] |= (uint32_t)buf[pos + 3] << 4;
        vec[1] |= (uint32_t)buf[pos + 4] << 12;

        pos += 5;

        if (vec[0] <= 2 * GAMMA1 - 2) {
            r[ctr++] = Q + GAMMA1 - 1 - vec[0];
        }
        if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) {
            r[ctr++] = Q + GAMMA1 - 1 - vec[1];
        }
    }

    return ctr;
}

+ 26
- 0
crypto_sign/dilithium3/avx2/rejsample.h View File

@@ -0,0 +1,26 @@
/* Prototypes for the AVX2 rejection samplers implemented in rejsample.c.
 *
 * All three share one contract: consume at most buflen bytes from buf,
 * write at most len accepted values to r, and return how many were written.
 * NOTE(review): the include guard is not namespaced like the one in
 * polyvec.h (PQCLEAN_DILITHIUM3_AVX2_...). */
#ifndef REJSAMPLE_H
#define REJSAMPLE_H

#include <stdint.h>

#include "poly.h"

/* Keeps 23-bit candidates (3 bytes each) that are smaller than Q. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

/* Keeps 4-bit candidates t (two per byte) with t <= 2*ETA, stored as
 * Q + ETA - t. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

/* Keeps 20-bit candidates t (two per 5 bytes) with t <= 2*GAMMA1 - 2,
 * stored as Q + GAMMA1 - 1 - t. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(
uint32_t *r,
unsigned int len,
const uint8_t *buf,
unsigned int buflen);

#endif

+ 115
- 0
crypto_sign/dilithium3/avx2/rounding.c View File

@@ -0,0 +1,115 @@
#include "rounding.h"

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_power2round
*
* Description: Splits a finite field element a into a high part a1 and a
*              centered low part a0 such that a mod Q = a1*2^D + a0 with
*              -2^{D-1} < a0 <= 2^{D-1}. Assumes a to be a standard
*              representative.
*
* Arguments:   - uint32_t a: input element
*              - uint32_t *a0: pointer to output element, receives Q + a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0) {
    const uint32_t pow2d = 1U << D;
    const uint32_t half = 1U << (D - 1);
    int32_t rem;

    /* Low D bits of a, i.e. a mod 2^D in [0, 2^D). */
    rem = a & (pow2d - 1);

    /* Branch-free centering: after the first subtraction rem is negative
     * exactly when the low bits were <= 2^{D-1}; in that case 2^D is added
     * back. Net effect: rem stays as is if <= 2^{D-1}, else rem - 2^D.
     * Relies on >> of a negative int32_t being an arithmetic shift. */
    rem -= half + 1;
    rem += (rem >> 31) & pow2d;
    rem -= half - 1;

    *a0 = Q + rem;
    return (a - rem) >> D;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_decompose
*
* Description: Splits a finite field element a into high and low parts
*              a1, a0 such that a mod Q = a1*ALPHA + a0 with
*              -ALPHA/2 < a0 <= ALPHA/2, except in the border case
*              a1 = (Q-1)/ALPHA, where a1 is set to 0 and
*              -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be a
*              standard representative.
*
*              NOTE(review): the shift amounts 19 and 9 and the masks
*              0x7FFFF / 0xF are hard-coded; they appear to assume
*              ALPHA == 2^19 - 2^9 and (Q-1)/ALPHA == 16 -- confirm
*              against params.h.
*
* Arguments:   - uint32_t a: input element
*              - uint32_t *a0: pointer to output element, receives Q + a0
*
* Returns a1.
**************************************************/
uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0) {
    int32_t low, is_zero;

    /* Centralized remainder mod ALPHA: fold the bits above position 19
     * back in with weight 2^9, then center branch-free (the sign mask
     * relies on >> of a negative int32_t being an arithmetic shift). */
    low = a & 0x7FFFF;
    low += (a >> 19) << 9;
    low -= ALPHA / 2 + 1;
    low += (low >> 31) & ALPHA;
    low -= ALPHA / 2 - 1;
    a -= low;

    /* Divide the remaining value by ALPHA without a division:
     * (a >> 19) + 1, corrected to 0 when a itself is 0. */
    is_zero = a - 1;
    is_zero >>= 31;
    a = (a >> 19) + 1;
    a -= is_zero & 1;

    /* Border case: bit 4 of the quotient set means a1 == 16; then a1 wraps
     * to 0 and the low part is decreased by one. */
    *a0 = Q + low - (a >> 4);
    return a & 0xF;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_make_hint
*
* Description: Computes the hint bit indicating whether the low bits of
*              the input element overflow into the high bits. Inputs are
*              assumed to be standard representatives.
*
* Arguments:   - uint32_t a0: low bits of input element
*              - uint32_t a1: high bits of input element
*
* Returns 0 if a0 <= GAMMA2, or a0 > Q - GAMMA2, or
* (a0 == Q - GAMMA2 and a1 == 0); returns 1 otherwise.
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(const uint32_t a0, const uint32_t a1) {
    const int small_positive = (a0 <= GAMMA2);
    const int small_negative = (a0 > Q - GAMMA2);
    const int on_border = (a0 == Q - GAMMA2) && (a1 == 0);

    return (small_positive || small_negative || on_border) ? 0 : 1;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_use_hint
*
* Description: Corrects the high bits of a according to the hint.
*              The element is first split with
*              PQCLEAN_DILITHIUM3_AVX2_decompose; with a zero hint the high
*              part is returned unchanged, otherwise it is moved up by one
*              if the low part (stored as Q + a0) is greater than Q and
*              down by one if not, wrapping modulo 16.
*
* Arguments:   - uint32_t a: input element
*              - unsigned int hint: hint bit
*
* Returns corrected high bits.
**************************************************/
uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(const uint32_t a, const unsigned int hint) {
    uint32_t low;
    const uint32_t high = PQCLEAN_DILITHIUM3_AVX2_decompose(a, &low);

    if (hint == 0) {
        return high;
    }

    /* low > Q means the centered low part is positive. */
    return (low > Q) ? ((high + 1) & 0xF) : ((high - 1) & 0xF);
}

+ 12
- 0
crypto_sign/dilithium3/avx2/rounding.h View File

@@ -0,0 +1,12 @@
/* Prototypes for the scalar rounding helpers implemented in rounding.c.
 * NOTE(review): the include guard is not namespaced like the one in
 * polyvec.h (PQCLEAN_DILITHIUM3_AVX2_...). */
#ifndef ROUNDING_H
#define ROUNDING_H

#include "params.h"
#include <stdint.h>

/* Split a into a1*2^D + a0; returns a1 and stores Q + a0 in *a0. */
uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0);
/* Split a into a1*ALPHA + a0; returns a1 and stores Q + a0 in *a0. */
uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0);
/* Returns the hint bit (0 or 1) for low part a0 and high part a1. */
unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(uint32_t a0, uint32_t a1);
/* Returns the high bits of a, corrected by the hint bit. */
uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(uint32_t a, unsigned int hint);

#endif

+ 23
- 0
crypto_sign/dilithium3/avx2/shuffle.inc View File

@@ -0,0 +1,23 @@
# shuffle8 r0,r1,r2,r3: regroup 128-bit halves of ymm<r0> and ymm<r1>.
#   ymm<r2> = { low half of r0,  low half of r1  }
#   ymm<r3> = { high half of r0, high half of r1 }
# Arguments are register numbers. No scratch registers are used.
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle4 r0,r1,r2,r3: interleave 64-bit elements within each 128-bit lane.
#   ymm<r2> = per lane { low qword of r0,  low qword of r1  }
#   ymm<r3> = per lane { high qword of r0, high qword of r1 }
# Arguments are register numbers. No scratch registers are used.
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm

# shuffle2 r0,r1,r2,r3: interleave 32-bit elements of ymm<r0> and ymm<r1>.
#   ymm<r2> = even dwords of r0 in even slots, even dwords of r1 in odd slots
#   ymm<r3> = odd dwords of r0 in even slots,  odd dwords of r1 in odd slots
# Arguments are register numbers. Clobbers ymm12 and ymm13 as scratch, so
# neither may be used as an input or output operand.
.macro shuffle2 r0,r1,r2,r3
vpsllq $32,%ymm\r1,%ymm12
vpsrlq $32,%ymm\r0,%ymm13
vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
.endm

# shuffle1 r0,r1,r2,r3: interleave 16-bit elements of ymm<r0> and ymm<r1>.
#   ymm<r2> = even words of r0 in even slots, even words of r1 in odd slots
#   ymm<r3> = odd words of r0 in even slots,  odd words of r1 in odd slots
# Arguments are register numbers. Clobbers ymm12 and ymm13 as scratch, so
# neither may be used as an input or output operand.
.macro shuffle1 r0,r1,r2,r3
vpslld $16,%ymm\r1,%ymm12
vpsrld $16,%ymm\r0,%ymm13
vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
.endm

+ 446
- 0
crypto_sign/dilithium3/avx2/sign.c View File

@@ -0,0 +1,446 @@
#include <stdint.h>
#include <string.h>

#include "fips202.h"
#include "packing.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include "randombytes.h"
#include "sign.h"
#include "symmetric.h"

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_expand_mat
*
* Description: Implementation of ExpandA. Fills the matrix A row by row;
*              each row is produced by one call to
*              PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x, which samples its
*              four polynomials from rho with nonces 256*row + column.
*              The dimensions (5 rows of 4 polynomials) are hard-coded.
*
* Arguments:   - polyvecl mat[5]: output matrix
*              - const uint8_t rho[]: byte array containing seed rho
**************************************************/

void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[5], const uint8_t rho[SEEDBYTES]) {
    int row;

    for (row = 0; row < 5; ++row) {
        /* Nonce of entry (row, col) is 256*row + col. */
        const int base = row * 256;

        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[row].vec[0],
                                                &mat[row].vec[1],
                                                &mat[row].vec[2],
                                                &mat[row].vec[3],
                                                rho, base, base + 1, base + 2, base + 3);
    }
}


/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_challenge
*
* Description: Implementation of H. Samples a polynomial with 60 nonzero
*              coefficients in {-1,1} (stored as 1 and Q-1) using the
*              output stream of SHAKE256(mu|w1). The first 8 stream bytes
*              supply the sign bits; the following bytes drive an in-place
*              shuffle over positions 196..255.
*
* Arguments:   - poly *c: pointer to output polynomial
*              - const uint8_t mu[]: byte array containing mu
*              - const polyveck *w1: pointer to vector w1
**************************************************/
void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c,
                                       const uint8_t mu[CRHBYTES],
                                       const polyveck *w1) {
    unsigned int k, j, used;
    uint64_t signbits = 0;
    uint8_t seed[CRHBYTES + K * POLW1_SIZE_PACKED];
    uint8_t block[SHAKE256_RATE];
    shake256ctx state;

    /* Hash input: mu followed by the K packed polynomials of w1. */
    memcpy(seed, mu, CRHBYTES);
    for (k = 0; k < K; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(seed + CRHBYTES + k * POLW1_SIZE_PACKED, &w1->vec[k]);
    }

    shake256_absorb(&state, seed, sizeof(seed));
    shake256_squeezeblocks(block, 1, &state);

    /* First 8 output bytes, little-endian, are the 64 sign bits. */
    for (k = 0; k < 8; ++k) {
        signbits |= (uint64_t) block[k] << 8 * k;
    }
    used = 8;

    for (k = 0; k < N; ++k) {
        c->coeffs[k] = 0;
    }

    for (k = 196; k < 256; ++k) {
        /* Draw bytes until one is <= k, squeezing a fresh block whenever
         * the current one is exhausted. */
        for (;;) {
            if (used >= SHAKE256_RATE) {
                shake256_squeezeblocks(block, 1, &state);
                used = 0;
            }

            j = block[used++];
            if (j <= k) {
                break;
            }
        }

        /* Move the old value at j to position k, then place +-1 at j.
         * The mask turns 1 into Q - 1 when the sign bit is set, without
         * branching on it. */
        c->coeffs[k] = c->coeffs[j];
        c->coeffs[j] = 1;
        c->coeffs[j] ^= -(signbits & 1) & (1 ^ (Q - 1));
        signbits >>= 1;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair
*
* Description: Generates public and private key.
*
* Arguments:   - uint8_t *pk: pointer to output public key (allocated
*                             array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
*              - uint8_t *sk: pointer to output private key (allocated
*                             array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
    uint8_t seeds[3 * SEEDBYTES];
    uint8_t tr[CRHBYTES];
    /* seeds is laid out as rho | rhoprime | key, SEEDBYTES each. */
    const uint8_t *const rho = seeds;
    const uint8_t *const rhoprime = seeds + SEEDBYTES;
    const uint8_t *const key = seeds + 2 * SEEDBYTES;
    polyvecl mat[K];
    polyvecl s1, s1hat;
    polyveck s2, t, t1, t0;
    unsigned int row;

    /* Draw rho, rhoprime and key directly from the random source. */
    randombytes(seeds, 3 * SEEDBYTES);

    /* Expand matrix */
    PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho);

    /* Sample short vectors s1 (nonces 0..3) and s2 (nonces 4..8); the fifth
     * polynomial of s2 does not fit the 4-way sampler and is done alone. */
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime,
            0, 1, 2, 3);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime,
            4, 5, 6, 7);
    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&s2.vec[4], rhoprime, 8);

    /* Matrix-vector multiplication t = A * s1 (in the NTT domain).
     * NOTE(review): unlike the signing path, no poly_reduce is applied
     * before the inverse NTT here; the call was commented out in the
     * original -- confirm the coefficient bounds make it unnecessary. */
    s1hat = s1;
    PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1hat);
    for (row = 0; row < K; ++row) {
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[row], &mat[row], &s1hat);
        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&t.vec[row]);
    }

    /* Add error vector s2 */
    PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&t, &t, &s2);

    /* Extract t1 and write public key */
    PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&t);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(&t1, &t0, &t);
    PQCLEAN_DILITHIUM3_AVX2_pack_pk(pk, rho, &t1);

    /* tr = CRH(pk); then write the secret key */
    crh(tr, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES);
    PQCLEAN_DILITHIUM3_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

    return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature
*
* Description: Computes a detached signature of the message m. Candidate
*              signatures are generated in a loop and discarded until all
*              norm and hint-count checks pass.
*
* Arguments:   - uint8_t *sig: pointer to output signature (allocated array
*                              of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES bytes)
*              - size_t *siglen: pointer to output length of the signature
*                              (set to PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES)
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(
    uint8_t *sig, size_t *siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk) {
    /* seedbuf is laid out as rho | tr | key | mu | rhoprime, so that key
     * and mu are contiguous for the rhoprime derivation below. */
    uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES];
    uint8_t *const rho = seedbuf;
    uint8_t *const tr = rho + SEEDBYTES;
    uint8_t *const key = tr + CRHBYTES;
    uint8_t *const mu = key + SEEDBYTES;
    uint8_t *const rhoprime = mu + CRHBYTES;
    uint16_t nonce = 0;
    unsigned int hint_count;
    size_t k;
    poly c, chat;
    polyvecl mat[K], s1, y, yhat, z;
    polyveck t0, s2, w, w1, w0;
    polyveck h, cs2, ct0;
    shake256incctx state;

    PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);

    /* mu = CRH(tr, m), computed incrementally to avoid copying m. */
    shake256_inc_init(&state);
    shake256_inc_absorb(&state, tr, CRHBYTES);
    shake256_inc_absorb(&state, m, mlen);
    shake256_inc_finalize(&state);
    shake256_inc_squeeze(mu, CRHBYTES, &state);

    /* rhoprime = CRH(key | mu) */
    crh(rhoprime, key, SEEDBYTES + CRHBYTES);

    /* Expand matrix and transform vectors */
    PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho);
    PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&s2);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t0);

    /* Rejection loop: every `continue` discards the current candidate and
     * restarts with fresh nonces. */
    for (;;) {
        /* Sample intermediate vector y */
        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &y.vec[3],
                rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3);
        nonce += 4;

        /* Matrix-vector multiplication w = A * y */
        yhat = y;
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&yhat);
        for (k = 0; k < K; ++k) {
            PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[k], &mat[k], &yhat);
            PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w.vec[k]);
            PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&w.vec[k]);
        }

        /* Decompose w and call the random oracle */
        PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w);
        PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &w0, &w);
        PQCLEAN_DILITHIUM3_AVX2_challenge(&c, mu, &w1);
        chat = c;
        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat);

        /* Check that subtracting cs2 does not change high bits of w and
         * low bits do not reveal secret information */
        for (k = 0; k < K; ++k) {
            PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&cs2.vec[k], &chat, &s2.vec[k]);
            PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&cs2.vec[k]);
        }
        PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&w0, &w0, &cs2);
        PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&w0);
        if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
            continue;
        }

        /* Compute z = y + c*s1, reject if it reveals secret */
        for (k = 0; k < L; ++k) {
            PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&z.vec[k], &chat, &s1.vec[k]);
            PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&z.vec[k]);
        }
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(&z, &z, &y);
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(&z);
        if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
            continue;
        }

        /* Compute hints for w1 */
        for (k = 0; k < K; ++k) {
            PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&ct0.vec[k], &chat, &t0.vec[k]);
            PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&ct0.vec[k]);
        }

        PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&ct0);
        if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&ct0, GAMMA2)) {
            continue;
        }

        PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&w0, &w0, &ct0);
        PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w0);
        hint_count = PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(&h, &w0, &w1);
        if (hint_count > OMEGA) {
            continue;
        }

        /* All checks passed. */
        break;
    }

    /* Write signature */
    PQCLEAN_DILITHIUM3_AVX2_pack_sig(sig, &z, &h, &c);
    *siglen = PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
    return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign
*
* Description: Compute signed message: the signature followed by a copy of
*              the message.
*
* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
*                             array with PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen bytes),
*                             can be equal to m
*              - size_t *smlen: pointer to output length of signed
*                             message
*              - const uint8_t *m: pointer to message to be signed
*              - size_t mlen: length of message
*              - const uint8_t *sk: pointer to bit-packed secret key
*
* Returns the result of PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature
* (0 on success)
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(
    uint8_t *sm, size_t *smlen,
    const uint8_t *m, size_t mlen,
    const uint8_t *sk) {
    uint8_t *const msg = sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
    int rc;

    /* Move the message to its final place first; memmove because sm and m
     * are allowed to overlap. */
    memmove(msg, m, mlen);

    /* Sign the relocated copy, not m: when sm and m overlap, the move above
     * may already have overwritten part of the bytes m points to. The
     * signature itself is written to sm[0 .. CRYPTO_BYTES), which does not
     * overlap msg. */
    rc = PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, msg, mlen, sk);
    *smlen += mlen;
    return rc;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify
*
* Description: Verifies a detached signature of the message m.
*
* Arguments:   - const uint8_t *sig: signature
*              - size_t siglen: length of signature; rejected if smaller
*                             than PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES
*              - const uint8_t *m: pointer to message
*              - size_t mlen: length of message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if the signature could be verified correctly and -1 otherwise
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(
    const uint8_t *sig, size_t siglen,
    const uint8_t *m, size_t mlen,
    const uint8_t *pk) {
    size_t k;
    uint8_t rho[SEEDBYTES];
    uint8_t mu[CRHBYTES];
    poly c, chat, c_check;
    polyvecl mat[K], z;
    polyveck t1, w1, h, az, ct1;
    shake256incctx state;

    if (siglen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) {
        return -1;
    }

    /* Unpack inputs; reject malformed signatures and out-of-range z. */
    PQCLEAN_DILITHIUM3_AVX2_unpack_pk(rho, &t1, pk);
    if (PQCLEAN_DILITHIUM3_AVX2_unpack_sig(&z, &h, &c, sig) != 0) {
        return -1;
    }
    if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA) != 0) {
        return -1;
    }

    /* mu = CRH(CRH(pk), msg); mu is reused as the intermediate buffer. */
    crh(mu, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES);

    shake256_inc_init(&state);
    shake256_inc_absorb(&state, mu, CRHBYTES);
    shake256_inc_absorb(&state, m, mlen);
    shake256_inc_finalize(&state);
    shake256_inc_squeeze(mu, CRHBYTES, &state);

    /* Matrix-vector multiplication; compute Az - c2^dt1 */
    PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho);
    PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&z);
    for (k = 0; k < K; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&az.vec[k], &mat[k], &z);
    }

    chat = c;
    PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(&t1);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t1);
    for (k = 0; k < K; ++k) {
        PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&ct1.vec[k], &chat, &t1.vec[k]);
    }

    /* The difference is formed in the NTT domain, then transformed back. */
    PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&az, &az, &ct1);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(&az);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(&az);

    /* Reconstruct w1 */
    PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&az);
    PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(&w1, &az, &h);

    /* Call random oracle and verify challenge: the recomputed challenge
     * must equal the one carried in the signature, coefficient by
     * coefficient. */
    PQCLEAN_DILITHIUM3_AVX2_challenge(&c_check, mu, &w1);
    for (k = 0; k < N; ++k) {
        if (c.coeffs[k] != c_check.coeffs[k]) {
            return -1;
        }
    }

    return 0;
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open
*
* Description: Verify a signed message (signature || message) and, on
*              success, extract the message.
*
* Arguments:   - uint8_t *m: pointer to output message (allocated
*                array with smlen bytes), can be equal to sm
*              - size_t *mlen: pointer to output length of message
*              - const uint8_t *sm: pointer to signed message
*              - size_t smlen: length of signed message
*              - const uint8_t *pk: pointer to bit-packed public key
*
* Returns 0 if the signed message could be verified correctly. Otherwise
* returns -1, sets *mlen to (size_t) -1 and zeroes the first smlen bytes of m.
**************************************************/
int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(
    uint8_t *m, size_t *mlen,
    const uint8_t *sm, size_t smlen,
    const uint8_t *pk) {
    size_t pos;

    if (smlen >= PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) {
        /* The message follows the fixed-size signature. */
        const uint8_t *payload = sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;

        *mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES;
        if (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(
                    sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES,
                    payload, *mlen, pk) == 0) {
            /* Valid: hand the message back, copying front to back. */
            for (pos = 0; pos < *mlen; ++pos) {
                m[pos] = payload[pos];
            }
            return 0;
        }
    }

    /* Too short or verification failed: report failure and wipe the output. */
    *mlen = (size_t) -1;
    for (pos = 0; pos < smlen; ++pos) {
        m[pos] = 0;
    }

    return -1;
}

+ 15
- 0
crypto_sign/dilithium3/avx2/sign.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_SIGN_H
#define PQCLEAN_DILITHIUM3_AVX2_SIGN_H

/*
 * Internal helpers of the Dilithium3 AVX2 signature code.
 * The include guard carries the implementation namespace, like the other
 * headers in this directory, so it cannot clash with the sign.h of another
 * parameter set or implementation.
 */

#include <stdint.h>

#include "api.h"
#include "params.h"
#include "poly.h"
#include "polyvec.h"

/* Fill the K rows of mat from the seed rho (SEEDBYTES bytes). */
void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);

/* Write to c the challenge polynomial derived from mu (CRHBYTES bytes) and w1. */
void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES],
                                       const polyveck *w1);


#endif


+ 26
- 0
crypto_sign/dilithium3/avx2/stream.c View File

@@ -0,0 +1,26 @@
#include "stream.h"

#include <string.h>

/*
 * Hand seed || nonce to shake128_absorb for the given state.
 * The nonce is appended as two bytes, low byte first.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init(
    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
    uint8_t input[SEEDBYTES + 2];
    uint8_t *tail = input + SEEDBYTES;

    memcpy(input, seed, SEEDBYTES);
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake128_absorb(state, input, sizeof input);
}


/*
 * Hand seed || nonce to shake256_absorb for the given state.
 * The seed is CRHBYTES long; the nonce is appended as two bytes, low byte
 * first.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init(
    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
    uint8_t input[CRHBYTES + 2];
    uint8_t *tail = input + CRHBYTES;

    memcpy(input, seed, CRHBYTES);
    tail[0] = (uint8_t)(nonce & 0xFF);
    tail[1] = (uint8_t)(nonce >> 8);

    shake256_absorb(state, input, sizeof input);
}

+ 15
- 0
crypto_sign/dilithium3/avx2/stream.h View File

@@ -0,0 +1,15 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_STREAM_H
#define PQCLEAN_DILITHIUM3_AVX2_STREAM_H

/* Helpers that set up a SHAKE state from a seed and a 16-bit nonce. */

#include <stdint.h>

#include "fips202.h"
#include "params.h"

/*
 * Passes seed (SEEDBYTES bytes) followed by the nonce, encoded as two bytes
 * with the low byte first, to shake128_absorb on state.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init(
shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);

/*
 * Passes seed (CRHBYTES bytes) followed by the nonce, encoded as two bytes
 * with the low byte first, to shake256_absorb on state.
 */
void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init(
shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);

#endif

+ 23
- 0
crypto_sign/dilithium3/avx2/symmetric.h View File

@@ -0,0 +1,23 @@
#ifndef PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H
#define PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H

/*
 * Maps the generic names used by the Dilithium code (crh, stream128_*,
 * stream256_*) onto the SHAKE-based functions of this implementation.
 */

#include "params.h"
#include "stream.h"


#include "fips202.h"

/* crh: expands to shake256(OUT, CRHBYTES, IN, INBYTES), i.e. a CRHBYTES-byte output. */
#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
/* stream128: seed+nonce initialisation from stream.h, block squeezing via shake128_squeezeblocks. */
#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init(STATE, SEED, NONCE)
#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
/* stream256: seed+nonce initialisation from stream.h, block squeezing via shake256_squeezeblocks. */
#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init(STATE, SEED, NONCE)
#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE)

/* Block sizes of the two streams, taken from the SHAKE rate constants. */
#define STREAM128_BLOCKBYTES SHAKE128_RATE
#define STREAM256_BLOCKBYTES SHAKE256_RATE

/* State types that go with the stream128_* and stream256_* macros. */
typedef shake128ctx stream128_state;
typedef shake256ctx stream256_state;


#endif

+ 6
- 2
crypto_sign/dilithium3/clean/LICENSE View File

@@ -1,2 +1,6 @@
Public Domain
Authors: Léo Ducas, Eike Kiltz, Tancrède Lepoint, Vadim Lyubashevsky, Gregor Seiler, Peter Schwabe, Damien Stehlé
Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)

For Keccak and the random number generator
we are using public-domain code from sources
and by authors listed in comments on top of
the respective files.

+ 3
- 3
crypto_sign/dilithium3/clean/Makefile View File

@@ -2,10 +2,10 @@

LIB=libdilithium3_clean.a

SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c symmetric.c
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o symmetric.o
SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c
OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o
HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \
reduce.h rounding.h symmetric.h
reduce.h rounding.h symmetric.h stream.h

CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)



+ 1
- 1
crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake View File

@@ -2,7 +2,7 @@
# nmake /f Makefile.Microsoft_nmake

LIBRARY=libdilithium3_clean.lib
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj symmetric.obj
OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj
CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX

all: $(LIBRARY)


+ 13
- 9
crypto_sign/dilithium3/clean/api.h View File

@@ -4,14 +4,25 @@
#include <stddef.h>
#include <stdint.h>


#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1472U
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 3504U
#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 2701U

#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3"

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(
uint8_t *pk, uint8_t *sk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(
uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(
uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(
uint8_t *sig, size_t *siglen,
@@ -21,13 +32,6 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(
const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen, const uint8_t *pk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen,
const uint8_t *msg, size_t len,
const uint8_t *sk);

int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen,
const uint8_t *sm, size_t smlen,
const uint8_t *pk);


#endif

+ 15
- 14
crypto_sign/dilithium3/clean/ntt.c View File

@@ -1,11 +1,12 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include <stdint.h>

/* Roots of unity in order needed by forward ntt */
static const uint32_t zetas[N] = {
/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM3_CLEAN_ntt */
static const uint32_t PQCLEAN_DILITHIUM3_CLEAN_zetas[N] = {
0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347,
2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464,
1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231,
@@ -40,8 +41,8 @@ static const uint32_t zetas[N] = {
8332111, 7018208, 3937738, 1400424, 7534263, 1976782
};

/* Roots of unity in order needed by inverse ntt */
static const uint32_t zetas_inv[N] = {
/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM3_CLEAN_ntt */
static const uint32_t PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[N] = {
6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416,
3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036,
3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683,
@@ -77,7 +78,7 @@ static const uint32_t zetas_inv[N] = {
};

/*************************************************
* Name: ntt
* Name: PQCLEAN_DILITHIUM3_CLEAN_ntt
*
* Description: Forward NTT, in-place. No modular reduction is performed after
* additions or subtractions. Hence output coefficients can be up
@@ -86,16 +87,16 @@ static const uint32_t zetas_inv[N] = {
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) {
void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t *p) {
unsigned int len, start, j, k;
uint32_t zeta, t;

k = 1;
for (len = 128; len > 0; len >>= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas[k++];
zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas[k++];
for (j = start; j < start + len; ++j) {
t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]);
t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
p[j + len] = p[j] + 2 * Q - t;
p[j] = p[j] + t;
}
@@ -104,7 +105,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) {
}

/*************************************************
* Name: invntt_frominvmont
* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont
*
* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
* In-place. No modular reductions after additions or
@@ -113,7 +114,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) {
*
* Arguments: - uint32_t p[N]: input/output coefficient array
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]) {
void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t *p) {
unsigned int start, len, j, k;
uint32_t t, zeta;
const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;
@@ -121,17 +122,17 @@ void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]) {
k = 0;
for (len = 1; len < N; len <<= 1) {
for (start = 0; start < N; start = j + len) {
zeta = zetas_inv[k++];
zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[k++];
for (j = start; j < start + len; ++j) {
t = p[j];
p[j] = t + p[j + len];
p[j + len] = t + 256 * Q - p[j + len];
p[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]);
p[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]);
}
}
}

for (j = 0; j < N; ++j) {
p[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)f * p[j]);
p[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) f * p[j]);
}
}

+ 4
- 3
crypto_sign/dilithium3/clean/ntt.h View File

@@ -1,9 +1,10 @@
#ifndef NTT_H
#define NTT_H
#ifndef PQCLEAN_DILITHIUM3_CLEAN_NTT_H
#define PQCLEAN_DILITHIUM3_CLEAN_NTT_H

#include "params.h"
#include <stdint.h>

#include "params.h"

void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]);
void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]);



+ 62
- 54
crypto_sign/dilithium3/clean/packing.c View File

@@ -4,17 +4,18 @@
#include "polyvec.h"

/*************************************************
* Name: pack_pk
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_pk
*
* Description: Bit-pack public key pk = (rho, t1).
*
* Arguments: - unsigned char pk[]: output byte array
* - const unsigned char rho[]: byte array containing rho
* Arguments: - uint8_t pk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const polyveck *t1: pointer to vector t1
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
const unsigned char rho[SEEDBYTES],
const polyveck *t1) {
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(
uint8_t *pk,
const uint8_t *rho,
const polyveck *t1) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -28,17 +29,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
}

/*************************************************
* Name: unpack_pk
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_pk
*
* Description: Unpack public key pk = (rho, t1).
*
* Arguments: - const unsigned char rho[]: output byte array for rho
* Arguments: - const uint8_t rho[]: output byte array for rho
* - const polyveck *t1: pointer to output vector t1
* - unsigned char pk[]: byte array containing bit-packed pk
* - uint8_t pk[]: byte array containing bit-packed pk
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES],
polyveck *t1,
const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) {
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(
uint8_t *rho,
polyveck *t1,
const uint8_t *pk) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -52,25 +54,26 @@ void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES],
}

/*************************************************
* Name: pack_sk
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sk
*
* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - unsigned char sk[]: output byte array
* - const unsigned char rho[]: byte array containing rho
* - const unsigned char key[]: byte array containing key
* - const unsigned char tr[]: byte array containing tr
* Arguments: - uint8_t sk[]: output byte array
* - const uint8_t rho[]: byte array containing rho
* - const uint8_t key[]: byte array containing key
* - const uint8_t tr[]: byte array containing tr
* - const polyvecl *s1: pointer to vector s1
* - const polyveck *s2: pointer to vector s2
* - const polyveck *t0: pointer to vector t0
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
const unsigned char rho[SEEDBYTES],
const unsigned char key[SEEDBYTES],
const unsigned char tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -104,25 +107,26 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
}

/*************************************************
* Name: unpack_sk
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sk
*
* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
*
* Arguments: - const unsigned char rho[]: output byte array for rho
* - const unsigned char key[]: output byte array for key
* - const unsigned char tr[]: output byte array for tr
* Arguments: - const uint8_t rho[]: output byte array for rho
* - const uint8_t key[]: output byte array for key
* - const uint8_t tr[]: output byte array for tr
* - const polyvecl *s1: pointer to output vector s1
* - const polyveck *s2: pointer to output vector s2
* - polyveck *t0: pointer to output vector t0
* - unsigned char sk[]: byte array containing bit-packed sk
* - uint8_t sk[]: byte array containing bit-packed sk
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
unsigned char key[SEEDBYTES],
unsigned char tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const unsigned char sk[CRYPTO_SECRETKEYBYTES]) {
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk) {
unsigned int i;

for (i = 0; i < SEEDBYTES; ++i) {
@@ -156,19 +160,20 @@ void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
}

/*************************************************
* Name: pack_sig
* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sig
*
* Description: Bit-pack signature sig = (z, h, c).
*
* Arguments: - unsigned char sig[]: output byte array
* Arguments: - uint8_t sig[]: output byte array
* - const polyvecl *z: pointer to vector z
* - const polyveck *h: pointer to hint vector h
* - const poly *c: pointer to challenge polynomial
* - const poly *c: pointer to PQCLEAN_DILITHIUM3_CLEAN_challenge polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
const polyvecl *z,
const polyveck *h,
const poly *c) {
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(
uint8_t *sig,
const polyvecl *z,
const polyveck *h,
const poly *c) {
unsigned int i, j, k;
uint64_t signs, mask;

@@ -182,10 +187,11 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
for (i = 0; i < K; ++i) {
for (j = 0; j < N; ++j) {
if (h->vec[i].coeffs[j] != 0) {
sig[k++] = (unsigned char) j;
sig[k++] = (uint8_t)j;
}
}
sig[OMEGA + i] = (unsigned char) k;

sig[OMEGA + i] = (uint8_t)k;
}
while (k < OMEGA) {
sig[k++] = 0;
@@ -199,7 +205,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
sig[i] = 0;
for (j = 0; j < 8; ++j) {
if (c->coeffs[8 * i + j] != 0) {
sig[i] |= (unsigned char) (1U << j);
sig[i] |= (uint8_t)(1u << j);
if (c->coeffs[8 * i + j] == (Q - 1)) {
signs |= mask;
}
@@ -209,27 +215,28 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
}
sig += N / 8;
for (i = 0; i < 8; ++i) {
sig[i] = (unsigned char) (signs >> 8 * i);
sig[i] = (uint8_t)(signs >> 8u * i);
}
}

/*************************************************
* Name: unpack_sig
* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (z, h, c).
*
* Arguments: - polyvecl *z: pointer to output vector z
* - polyveck *h: pointer to output hint vector h
* - poly *c: pointer to output challenge polynomial
* - const unsigned char sig[]: byte array containing
* - poly *c: pointer to output PQCLEAN_DILITHIUM3_CLEAN_challenge polynomial
* - const uint8_t sig[]: byte array containing
* bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z,
polyveck *h,
poly *c,
const unsigned char sig[CRYPTO_BYTES]) {
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(
polyvecl *z,
polyveck *h,
poly *c,
const uint8_t *sig) {
unsigned int i, j, k;
uint64_t signs;

@@ -266,6 +273,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z,
return 1;
}
}

sig += OMEGA + K;

/* Decode c */


+ 29
- 24
crypto_sign/dilithium3/clean/packing.h View File

@@ -1,31 +1,36 @@
#ifndef PACKING_H
#define PACKING_H
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PACKING_H
#define PQCLEAN_DILITHIUM3_CLEAN_PACKING_H

#include "params.h"
#include "polyvec.h"

void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
const unsigned char rho[SEEDBYTES], const polyveck *t1);
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
const unsigned char rho[SEEDBYTES],
const unsigned char key[SEEDBYTES],
const unsigned char tr[CRHBYTES],
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES],
const polyvecl *z, const polyveck *h, const poly *c);
void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(
uint8_t *pk,
const uint8_t *rho, const polyveck *t1);
void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(
uint8_t *sk,
const uint8_t *rho,
const uint8_t *key,
const uint8_t *tr,
const polyvecl *s1,
const polyveck *s2,
const polyveck *t0);
void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(
uint8_t *sig,
const polyvecl *z, const polyveck *h, const poly *c);

void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1,
const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES],
unsigned char key[SEEDBYTES],
unsigned char tr[CRHBYTES],
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const unsigned char sk[CRYPTO_SECRETKEYBYTES]);
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z, polyveck *h, poly *c,
const unsigned char sig[CRYPTO_BYTES]);
void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(
uint8_t *rho, polyveck *t1,
const uint8_t *pk);
void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(
uint8_t *rho,
uint8_t *key,
uint8_t *tr,
polyvecl *s1,
polyveck *s2,
polyveck *t0,
const uint8_t *sk);
int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(
polyvecl *z, polyveck *h, poly *c, const uint8_t *sig);

#endif

+ 4
- 4
crypto_sign/dilithium3/clean/params.h View File

@@ -1,18 +1,17 @@
#ifndef PARAMS_H
#define PARAMS_H
#ifndef PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H
#define PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H


#define SEEDBYTES 32
#define CRHBYTES 48
#define N 256
#define Q 8380417
#define QBITS 23
#define ROOT_OF_UNITY 1753
#define D 14
#define GAMMA1 ((Q - 1)/16)
#define GAMMA2 (GAMMA1/2)
#define ALPHA (2*GAMMA2)

// DilithiumIII parameters
#define K 5
#define L 4
#define ETA 5
@@ -20,6 +19,7 @@
#define BETA 275
#define OMEGA 96


#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8)
#define POLT0_SIZE_PACKED ((N*D)/8)
#define POLETA_SIZE_PACKED ((N*SETABITS)/8)


+ 131
- 164
crypto_sign/dilithium3/clean/poly.c View File

@@ -1,10 +1,11 @@
#include <stdint.h>

#include "ntt.h"
#include "params.h"
#include "poly.h"
#include "reduce.h"
#include "rounding.h"
#include "symmetric.h"
#include <stdint.h>


/*************************************************
@@ -16,8 +17,7 @@
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_reduce32(a->coeffs[i]);
}
}
@@ -31,8 +31,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_csubq(a->coeffs[i]);
}
}
@@ -46,8 +45,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_freeze(a->coeffs[i]);
}
}
@@ -61,9 +59,8 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) {
* - const poly *a: pointer to first summand
* - const poly *b: pointer to second summand
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
unsigned int i;
for (i = 0; i < N; ++i) {
void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
}
}
@@ -78,11 +75,10 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) {
* Arguments: - poly *c: pointer to output polynomial
* - const poly *a: pointer to first input polynomial
* - const poly *b: pointer to second input polynomial to be
* subtraced from first input polynomial
* subtracted from first input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i];
}
}
@@ -96,8 +92,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
* Arguments: - poly *a: pointer to input/output polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a) {
unsigned int i;
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] <<= D;
}
}
@@ -139,9 +134,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(poly *a) {
* - const poly *b: pointer to second input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
c->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]);
}

@@ -160,12 +153,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]);
}

}

/*************************************************
@@ -182,12 +172,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]);
}

}

/*************************************************
@@ -204,13 +191,11 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a)
* Returns number of 1 bits.
**************************************************/
unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) {
unsigned int i, s = 0;

for (i = 0; i < N; ++i) {
unsigned int s = 0;
for (size_t i = 0; i < N; ++i) {
h->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]);
s += h->coeffs[i];
}

return s;
}

@@ -224,12 +209,9 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, co
* - const poly *h: pointer to input hint polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) {
unsigned int i;

for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]);
}

}

/*************************************************
@@ -244,15 +226,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *
* Returns 0 if norm is strictly smaller than B and 1 otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
unsigned int i;
int32_t t;

/* It is ok to leak which coefficient violates the bound since
the probability for each coefficient is independent of secret
data but we must not leak the sign of the centralized representative. */
for (i = 0; i < N; ++i) {
for (size_t i = 0; i < N; ++i) {
/* Absolute value of centralized representative */
t = (int32_t) ((Q - 1) / 2 - a->coeffs[i]);
t = (int32_t)((Q - 1) / 2 - a->coeffs[i]);
t ^= (t >> 31);
t = (Q - 1) / 2 - t;

@@ -260,7 +240,6 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
return 1;
}
}

return 0;
}

@@ -272,7 +251,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -280,8 +259,8 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) {
**************************************************/
static unsigned int rej_uniform(uint32_t *a,
unsigned int len,
const unsigned char *buf,
unsigned int buflen) {
const uint8_t *buf,
size_t buflen) {
unsigned int ctr, pos;
uint32_t t;

@@ -308,19 +287,20 @@ static unsigned int rej_uniform(uint32_t *a,
* output stream from SHAKE256(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES)/STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t seed[SEEDBYTES],
uint16_t nonce) {
unsigned int i, ctr, off;
unsigned int buflen = POLY_UNIFORM_BUFLEN;
unsigned char buf[POLY_UNIFORM_BUFLEN + 2];
shake128ctx state;
unsigned int i, ctr;
size_t buflen = POLY_UNIFORM_BUFLEN;
uint8_t buf[POLY_UNIFORM_BUFLEN + 2];
stream128_state state;
size_t off;

stream128_init(&state, seed, nonce);
stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state);
@@ -347,7 +327,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -355,7 +335,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
**************************************************/
static unsigned int rej_eta(uint32_t *a,
unsigned int len,
const unsigned char *buf,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint32_t t0, t1;
@@ -384,19 +364,18 @@ static unsigned int rej_eta(uint32_t *a,
* output stream from SHAKE256(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* SEEDBYTES
* - uint16_t nonce: 2-byte nonce
**************************************************/
#define POLY_UNIFORM_ETA_NBLOCKS (((N/2 * (1U << SETABITS)) / (2*ETA + 1)\
+ STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce) {
unsigned int ctr;
unsigned char buf[POLY_UNIFORM_ETA_BUFLEN];
shake128ctx state;
uint8_t buf[POLY_UNIFORM_ETA_BUFLEN];
stream128_state state;

stream128_init(&state, seed, nonce);
stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
@@ -418,7 +397,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
*
* Arguments: - uint32_t *a: pointer to output array (allocated)
* - unsigned int len: number of coefficients to be sampled
* - const unsigned char *buf: array of random bytes
* - const uint8_t *buf: array of random bytes
* - unsigned int buflen: length of array of random bytes
*
* Returns number of sampled coefficients. Can be smaller than len if not enough
@@ -426,7 +405,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
**************************************************/
static unsigned int rej_gamma1m1(uint32_t *a,
unsigned int len,
const unsigned char *buf,
const uint8_t *buf,
unsigned int buflen) {
unsigned int ctr, pos;
uint32_t t0, t1;
@@ -438,7 +417,7 @@ static unsigned int rej_gamma1m1(uint32_t *a,
t0 |= (uint32_t)buf[pos + 2] << 16;
t0 &= 0xFFFFF;

t1 = buf[pos + 2] >> 4;
t1 = buf[pos + 2] >> 4;
t1 |= (uint32_t)buf[pos + 3] << 4;
t1 |= (uint32_t)buf[pos + 4] << 12;

@@ -463,19 +442,19 @@ static unsigned int rej_gamma1m1(uint32_t *a,
* sampling on output stream of SHAKE256(seed|nonce).
*
* Arguments: - poly *a: pointer to output polynomial
* - const unsigned char seed[]: byte array with seed of length
* - const uint8_t seed[]: byte array with seed of length
* CRHBYTES
* - uint16_t nonce: 16-bit nonce
**************************************************/
#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES)
#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES)
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a,
const unsigned char seed[CRHBYTES],
const uint8_t seed[CRHBYTES],
uint16_t nonce) {
unsigned int i, ctr, off;
unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN;
unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
shake256ctx state;
uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4];
stream256_state state;

stream256_init(&state, seed, nonce);
stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state);
@@ -500,18 +479,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a,
* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
* Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLETA_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a) {
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a) {
unsigned int i;
unsigned char t[8];
uint8_t t[8];

for (i = 0; i < N / 2; ++i) {
t[0] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 0]);
t[1] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 1]);
r[i] = (uint8_t) (t[0] | (t[1] << 4));
t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]);
t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]);
r[i] = (uint8_t)(t[0] | (t[1] << 4));
}
}

@@ -522,68 +501,67 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a) {
* Output coefficients lie in [Q-ETA,Q+ETA].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) {
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
r->coeffs[2 * i + 0] = a[i] & 0x0F;
r->coeffs[2 * i + 1] = a[i] >> 4;
r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1];
}

}

/*************************************************
* Name: polyt1_pack
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack
*
* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLT1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(unsigned char *r, const poly *a) {
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a) {
unsigned int i;

for (i = 0; i < N / 8; ++i) {
r[9 * i + 0] = (uint8_t) ((a->coeffs[8 * i + 0] >> 0));
r[9 * i + 1] = (uint8_t) ((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1));
r[9 * i + 2] = (uint8_t) ((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2));
r[9 * i + 3] = (uint8_t) ((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3));
r[9 * i + 4] = (uint8_t) ((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4));
r[9 * i + 5] = (uint8_t) ((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5));
r[9 * i + 6] = (uint8_t) ((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6));
r[9 * i + 7] = (uint8_t) ((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7));
r[9 * i + 8] = (uint8_t) ((a->coeffs[8 * i + 7] >> 1));
r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0));
r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1));
r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2));
r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3));
r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4));
r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5));
r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6));
r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7));
r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1));
}

}

/*************************************************
* Name: polyt1_unpack
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack
*
* Description: Unpack polynomial t1 with 9-bit coefficients.
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) {
unsigned int i;

for (i = 0; i < N / 8; ++i) {
r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF;
r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF;
r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF;
r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF;
r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF;
r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF;
r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF;
r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF;
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) {
for (size_t i = 0; i < N / 8; ++i) {
r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF;
r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF;
r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF;
r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF;
r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF;
r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF;
r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF;
r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF;
}

}

/*************************************************
@@ -592,32 +570,30 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) {
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLT0_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a) {
unsigned int i;
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a) {
uint32_t t[4];

for (i = 0; i < N / 4; ++i) {
t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0];
t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1];
t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2];
t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3];
r[7 * i + 0] = (uint8_t) (t[0]);
r[7 * i + 1] = (uint8_t) (t[0] >> 8);
r[7 * i + 1] |= (uint8_t) (t[1] << 6);
r[7 * i + 2] = (uint8_t) (t[1] >> 2);
r[7 * i + 3] = (uint8_t) (t[1] >> 10);
r[7 * i + 3] |= (uint8_t) (t[2] << 4);
r[7 * i + 4] = (uint8_t) (t[2] >> 4);
r[7 * i + 5] = (uint8_t) (t[2] >> 12);
r[7 * i + 5] |= (uint8_t) (t[3] << 2);
r[7 * i + 6] = (uint8_t) (t[3] >> 6);
for (size_t i = 0; i < N / 4; ++i) {
t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0];
t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1];
t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2];
t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3];
r[7 * i + 0] = (uint8_t)(t[0]);
r[7 * i + 1] = (uint8_t)(t[0] >> 8);
r[7 * i + 1] |= (uint8_t)(t[1] << 6);
r[7 * i + 2] = (uint8_t)(t[1] >> 2);
r[7 * i + 3] = (uint8_t)(t[1] >> 10);
r[7 * i + 3] |= (uint8_t)(t[2] << 4);
r[7 * i + 4] = (uint8_t)(t[2] >> 4);
r[7 * i + 5] = (uint8_t)(t[2] >> 12);
r[7 * i + 5] |= (uint8_t)(t[3] << 2);
r[7 * i + 6] = (uint8_t)(t[3] >> 6);
}

}

/*************************************************
@@ -627,32 +603,30 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a) {
* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) {
unsigned int i;
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) {

for (i = 0; i < N / 4; ++i) {
r->coeffs[4 * i + 0] = a[7 * i + 0];
r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8;
for (size_t i = 0; i < N / 4; ++i) {
r->coeffs[4 * i + 0] = a[7 * i + 0];
r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8;

r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2;
r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10;
r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2;
r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10;

r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4;
r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12;
r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4;
r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12;

r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6;
r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6;

r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0];
r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1];
r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2];
r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3];
}

}

/*************************************************
@@ -662,29 +636,27 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) {
* in [-(GAMMA1 - 1), GAMMA1 - 1].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLZ_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a) {
unsigned int i;
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a) {
uint32_t t[2];

for (i = 0; i < N / 2; ++i) {
for (size_t i = 0; i < N / 2; ++i) {
/* Map to {0,...,2*GAMMA1 - 2} */
t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0];
t[0] += ((int32_t)t[0] >> 31) & Q;
t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1];
t[1] += ((int32_t)t[1] >> 31) & Q;

r[5 * i + 0] = (uint8_t) (t[0]);
r[5 * i + 1] = (uint8_t) (t[0] >> 8);
r[5 * i + 2] = (uint8_t) (t[0] >> 16);
r[5 * i + 2] |= (uint8_t) (t[1] << 4);
r[5 * i + 3] = (uint8_t) (t[1] >> 4);
r[5 * i + 4] = (uint8_t) (t[1] >> 12);
r[5 * i + 0] = (uint8_t)t[0];
r[5 * i + 1] = (uint8_t)(t[0] >> 8);
r[5 * i + 2] = (uint8_t)(t[0] >> 16);
r[5 * i + 2] |= (uint8_t)(t[1] << 4);
r[5 * i + 3] = (uint8_t)(t[1] >> 4);
r[5 * i + 4] = (uint8_t)(t[1] >> 12);
}

}

/*************************************************
@@ -695,26 +667,23 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a) {
* Output coefficients are standard representatives.
*
* Arguments: - poly *r: pointer to output polynomial
* - const unsigned char *a: byte array with bit-packed polynomial
* - const uint8_t *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
for (size_t i = 0; i < N / 2; ++i) {
r->coeffs[2 * i + 0] = a[5 * i + 0];
r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16;
r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8;
r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16;

r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4;
r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12;

r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q;
r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q;
}

}

/*************************************************
@@ -723,15 +692,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a) {
* Description: Bit-pack polynomial w1 with coefficients in [0, 15].
* Input coefficients are assumed to be standard representatives.
*
* Arguments: - unsigned char *r: pointer to output byte array with at least
* Arguments: - uint8_t *r: pointer to output byte array with at least
* POLW1_SIZE_PACKED bytes
* - const poly *a: pointer to input polynomial
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(unsigned char *r, const poly *a) {
unsigned int i;

for (i = 0; i < N / 2; ++i) {
r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4));
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a) {
for (size_t i = 0; i < N / 2; ++i) {
r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4);
}

}

+ 16
- 15
crypto_sign/dilithium3/clean/poly.h View File

@@ -1,9 +1,10 @@
#ifndef POLY_H
#define POLY_H
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLY_H
#define PQCLEAN_DILITHIUM3_CLEAN_POLY_H

#include "params.h"
#include <stdint.h>

#include "params.h"

typedef struct {
uint32_t coeffs[N];
} poly;
@@ -27,27 +28,27 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *

int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a,
const unsigned char seed[SEEDBYTES],
const uint8_t *seed,
uint16_t nonce);
void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a,
const unsigned char seed[CRHBYTES],
const uint8_t seed[CRHBYTES],
uint16_t nonce);

void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a);

void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(unsigned char *r, const poly *a);
void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a);

#endif

+ 29
- 26
crypto_sign/dilithium3/clean/polyvec.c View File

@@ -1,14 +1,15 @@
#include <stdint.h>

#include "params.h"
#include "poly.h"
#include "polyvec.h"
#include <stdint.h>

/**************************************************************/
/************ Vectors of polynomials of length L **************/
/**************************************************************/

/*************************************************
* Name: polyvecl_freeze
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze
*
* Description: Reduce coefficients of polynomials in vector of length L
* to standard representatives.
@@ -24,7 +25,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) {
}

/*************************************************
* Name: polyvecl_add
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add
*
* Description: Add vectors of polynomials of length L.
* No modular reduction is performed.
@@ -42,7 +43,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const
}

/*************************************************
* Name: polyvecl_ntt
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt
*
* Description: Forward NTT of all polynomials in vector of length L. Output
* coefficients can be up to 16*Q larger than input coefficients.
@@ -58,7 +59,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v) {
}

/*************************************************
* Name: polyvecl_pointwise_acc_invmontgomery
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery
*
* Description: Pointwise multiply vectors of polynomials of length L, multiply
* resulting vector by 2^{-32} and add (accumulate) polynomials
@@ -85,7 +86,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w,
}

/*************************************************
* Name: polyvecl_chknorm
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm
*
* Description: Check infinity norm of polynomials in vector of length L.
* Assumes input coefficients to be standard representatives.
@@ -96,14 +97,15 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w,
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) {
unsigned int i;

for (i = 0; i < L; ++i) {
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) {
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) {
return 1;
}
}

return 0;
}

@@ -113,7 +115,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound)


/*************************************************
* Name: polyveck_reduce
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce
*
* Description: Reduce coefficients of polynomials in vector of length K
* to representatives in [0,2*Q[.
@@ -129,7 +131,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v) {
}

/*************************************************
* Name: polyveck_csubq
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq
*
* Description: For all coefficients of polynomials in vector of length K
* subtract Q if coefficient is bigger than Q.
@@ -145,7 +147,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(polyveck *v) {
}

/*************************************************
* Name: polyveck_freeze
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze
*
* Description: Reduce coefficients of polynomials in vector of length K
* to standard representatives.
@@ -161,7 +163,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) {
}

/*************************************************
* Name: polyveck_add
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_add
*
* Description: Add vectors of polynomials of length K.
* No modular reduction is performed.
@@ -179,7 +181,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const
}

/*************************************************
* Name: polyveck_sub
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub
*
* Description: Subtract vectors of polynomials of length K.
* Assumes coefficients of polynomials in second input vector
@@ -199,7 +201,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const
}

/*************************************************
* Name: polyveck_shiftl
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl
*
* Description: Multiply vector of polynomials of length K by 2^D without modular
* reduction. Assumes input coefficients to be less than 2^{32-D}.
@@ -215,7 +217,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) {
}

/*************************************************
* Name: polyveck_ntt
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt
*
* Description: Forward NTT of all polynomials in vector of length K. Output
* coefficients can be up to 16*Q larger than input coefficients.
@@ -231,7 +233,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) {
}

/*************************************************
* Name: polyveck_invntt_montgomery
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery
*
* Description: Inverse NTT and multiplication by 2^{32} of polynomials
* in vector of length K. Input coefficients need to be less
@@ -248,7 +250,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v) {
}

/*************************************************
* Name: polyveck_chknorm
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm
*
* Description: Check infinity norm of polynomials in vector of length K.
* Assumes input coefficients to be standard representatives.
@@ -259,19 +261,20 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v) {
* Returns 0 if norm of all polynomials is strictly smaller than B and 1
* otherwise.
**************************************************/
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t bound) {
int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) {
unsigned int i;

for (i = 0; i < K; ++i) {
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) {
if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) {
return 1;
}
}

return 0;
}

/*************************************************
* Name: polyveck_power2round
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute a0, a1 such that a mod Q = a1*2^D + a0
@@ -293,7 +296,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, c
}

/*************************************************
* Name: polyveck_decompose
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose
*
* Description: For all coefficients a of polynomials in vector of length K,
* compute high and low bits a0, a1 such that a mod Q = a1*ALPHA + a0
@@ -316,7 +319,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con
}

/*************************************************
* Name: polyveck_make_hint
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint
*
* Description: Compute hint vector.
*
@@ -339,19 +342,19 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h,
}

/*************************************************
* Name: polyveck_use_hint
* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint
*
* Description: Use hint vector to correct the high bits of input vector.
*
* Arguments: - polyveck *w: pointer to output vector of polynomials with
* corrected high bits
* - const polyveck *u: pointer to input vector
* - const polyveck *v: pointer to input vector
* - const polyveck *h: pointer to input hint vector
**************************************************/
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
unsigned int i;

for (i = 0; i < K; ++i) {
PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]);
PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]);
}
}

+ 5
- 4
crypto_sign/dilithium3/clean/polyvec.h View File

@@ -1,9 +1,10 @@
#ifndef POLYVEC_H
#define POLYVEC_H
#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H
#define PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H

#include <stdint.h>

#include "params.h"
#include "poly.h"
#include <stdint.h>

/* Vectors of polynomials of length L */
typedef struct {
@@ -46,6 +47,6 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con
unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h,
const polyveck *v0,
const polyveck *v1);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h);
void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);

#endif

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save