diff --git a/crypto_sign/dilithium2/META.yml b/crypto_sign/dilithium2/META.yml index bf7c4cff..6761d77d 100644 --- a/crypto_sign/dilithium2/META.yml +++ b/crypto_sign/dilithium2/META.yml @@ -17,4 +17,13 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/dilithium/commit/40f79645879b5c69835cd91d06945d7c24f39922 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + required_flags: + - avx2 + - bmi2 diff --git a/crypto_sign/dilithium2/avx2/LICENSE b/crypto_sign/dilithium2/avx2/LICENSE new file mode 100644 index 00000000..40541676 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/LICENSE @@ -0,0 +1,6 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and the random number generator +we are using public-domain code from sources +and by authors listed in comments on top of +the respective files. 
diff --git a/crypto_sign/dilithium2/avx2/Makefile b/crypto_sign/dilithium2/avx2/Makefile new file mode 100644 index 00000000..8308ea46 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/Makefile @@ -0,0 +1,43 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium2_avx2.a + +SOURCES = fips202x4.c invntt.s nttconsts.c ntt.s packing.c pointwise.S poly.c \ + polyvec.c reduce.s rejsample.c rounding.c sign.c stream.c +OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ + polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o +HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ + nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ + fips202x4.h shuffle.inc + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -Wcast-align \ + -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +KECCAK4XDIR=../../../common/keccak4x +KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o +KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.S $(HEADERS) + $(AS) -c -o $@ $< + +$(LIB): $(OBJECTS) $(KECCAK4X) + $(AR) -r $@ $^ + +$(KECCAK4X): + $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) + $(MAKE) -C $(KECCAK4XDIR) clean + diff --git a/crypto_sign/dilithium2/avx2/alignment.h b/crypto_sign/dilithium2/avx2/alignment.h new file mode 100644 index 00000000..40279ed3 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/alignment.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H +#define PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t as_arr[N]; /* byte elements: 32 per 256-bit vector */ \ + __m256i as_vec[(N)/32]; \ + } + +#define ALIGNED_UINT32(N) \ + union { \ + uint32_t as_arr[N]; \ + __m256i as_vec[(N)/8]; \ + } + +#define ALIGNED_UINT64(N) \ + union { \ + uint64_t as_arr[N]; \ + __m256i as_vec[(N)/4]; /* 4 x uint64_t per 256-bit vector */ \ + } +
+#endif //PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H diff --git a/crypto_sign/dilithium2/avx2/api.h b/crypto_sign/dilithium2/avx2/api.h new file mode 100644 index 00000000..d4c2bc08 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/api.h @@ -0,0 +1,37 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_API_H +#define PQCLEAN_DILITHIUM2_AVX2_API_H + +#include <stddef.h> +#include <stdint.h> + +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1184U +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2800U +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2044U + +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" + + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair( + uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *msg, size_t len, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + + + +#endif diff --git a/crypto_sign/dilithium2/avx2/fips202x4.c b/crypto_sign/dilithium2/avx2/fips202x4.c new file mode 100644 index 00000000..d7d55683 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/fips202x4.c @@ -0,0 +1,239 @@ +#include <immintrin.h> +#include <stdint.h> + +#include "fips202.h" +#include "fips202x4.h" +#include "params.h" + +#define NROUNDS 24 +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) + +static uint64_t load64(const uint8_t *x) { + unsigned int i; + uint64_t r = 0; + + for (i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +static void store64(uint8_t *x, uint64_t u) { + unsigned int i; + + for (i = 0; i < 8; ++i) { + x[i] = (uint8_t)(u >> 8 * i); + } +} + +/* Use implementation from the Keccak Code Package */ +extern void
KeccakP1600times4_PermuteAll_24rounds(__m256i *s); +#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds + +static void keccak_absorb4x(__m256i *s, + unsigned int r, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen, + uint8_t p) { + unsigned long long i; + uint8_t t0[200]; + uint8_t t1[200]; + uint8_t t2[200]; + uint8_t t3[200]; + uint64_t *ss = (uint64_t *)s; + + for (i = 0; i < 25; ++i) { + s[i] = _mm256_setzero_si256(); /* clear the state without reading the caller's (possibly uninitialized) buffer */ + } + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) { + ss[4 * i + 0] ^= load64(m0 + 8 * i); + ss[4 * i + 1] ^= load64(m1 + 8 * i); + ss[4 * i + 2] ^= load64(m2 + 8 * i); + ss[4 * i + 3] ^= load64(m3 + 8 * i); + } + + KeccakF1600_StatePermute4x(s); + mlen -= r; + m0 += r; + m1 += r; + m2 += r; + m3 += r; + } + + for (i = 0; i < r; ++i) { + t0[i] = 0; + t1[i] = 0; + t2[i] = 0; + t3[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t0[i] = m0[i]; + t1[i] = m1[i]; + t2[i] = m2[i]; + t3[i] = m3[i]; + } + + t0[i] = p; + t1[i] = p; + t2[i] = p; + t3[i] = p; + + t0[r - 1] |= 128; + t1[r - 1] |= 128; + t2[r - 1] |= 128; + t3[r - 1] |= 128; + + for (i = 0; i < r / 8; ++i) { + ss[4 * i + 0] ^= load64(t0 + 8 * i); + ss[4 * i + 1] ^= load64(t1 + 8 * i); + ss[4 * i + 2] ^= load64(t2 + 8 * i); + ss[4 * i + 3] ^= load64(t3 + 8 * i); + } +} + + +static void keccak_squeezeblocks4x(uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + unsigned int r, + __m256i *s) { + unsigned int i; + uint64_t *ss = (uint64_t *)s; + + while (nblocks > 0) { + KeccakF1600_StatePermute4x(s); + for (i = 0; i < r / 8; ++i) { + store64(h0 + 8 * i, ss[4 * i + 0]); + store64(h1 + 8 * i, ss[4 * i + 1]); + store64(h2 + 8 * i, ss[4 * i + 2]); + store64(h3 + 8 * i, ss[4 * i + 3]); + } + + h0 += r; + h1 += r; + h2 += r; + h3 += r; + --nblocks; + } + +} + +void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t
*m2, + const uint8_t *m3, + unsigned long long mlen) { + keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); +} + +void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s) { + keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); +} + +void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); +} + +void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s) { + keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); +} + +void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + unsigned int i; + unsigned long nblocks = hlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + __m256i s[25]; + + PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + + h0 += nblocks * SHAKE128_RATE; + h1 += nblocks * SHAKE128_RATE; + h2 += nblocks * SHAKE128_RATE; + h3 += nblocks * SHAKE128_RATE; + hlen -= nblocks * SHAKE128_RATE; + + if (hlen) { + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); + for (i = 0; i < hlen; ++i) { + h0[i] = t[0][i]; + h1[i] = t[1][i]; + h2[i] = t[2][i]; + h3[i] = t[3][i]; + } + } +} + +void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + unsigned int i; + 
unsigned long nblocks = hlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + __m256i s[25]; + + PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); + PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + + h0 += nblocks * SHAKE256_RATE; + h1 += nblocks * SHAKE256_RATE; + h2 += nblocks * SHAKE256_RATE; + h3 += nblocks * SHAKE256_RATE; + hlen -= nblocks * SHAKE256_RATE; + + if (hlen) { + PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); + for (i = 0; i < hlen; ++i) { + h0[i] = t[0][i]; + h1[i] = t[1][i]; + h2[i] = t[2][i]; + h3[i] = t[3][i]; + } + } +} diff --git a/crypto_sign/dilithium2/avx2/fips202x4.h b/crypto_sign/dilithium2/avx2/fips202x4.h new file mode 100644 index 00000000..2ab1c106 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/fips202x4.h @@ -0,0 +1,65 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H +#define PQCLEAN_DILITHIUM2_AVX2_FIPS202X4_H + +#include <immintrin.h> +#include <stdint.h> + +#include "params.h" + +void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t
*h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +#endif diff --git a/crypto_sign/dilithium2/avx2/invntt.s b/crypto_sign/dilithium2/avx2/invntt.s new file mode 100644 index 00000000..7ae2b4e3 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/invntt.s @@ -0,0 +1,281 @@ +.include "shuffle.inc" + +.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 +vpaddd %ymm2,%ymm\l0,%ymm12 +vpaddd %ymm2,%ymm\l1,%ymm13 +vpaddd %ymm2,%ymm\l2,%ymm14 + +vpsubd %ymm\h0,%ymm12,%ymm12 +vpsubd %ymm\h1,%ymm13,%ymm13 +vpsubd %ymm\h2,%ymm14,%ymm14 + +vpmuludq %ymm\z0,%ymm12,%ymm12 +vpmuludq %ymm\z0,%ymm13,%ymm13 +vpaddd %ymm2,%ymm\l3,%ymm15 + +vpmuludq %ymm\z1,%ymm14,%ymm14 +vpsubd %ymm\h3,%ymm15,%ymm15 +vpaddd %ymm\l0,%ymm\h0,%ymm\l0 + +vpmuludq %ymm\z1,%ymm15,%ymm15 +vpaddd %ymm\l1,%ymm\h1,%ymm\l1 +vpaddd %ymm\l2,%ymm\h2,%ymm\l2 + +vpaddd %ymm\l3,%ymm\h3,%ymm\l3 + +vpmuludq %ymm0,%ymm12,%ymm\h0 +vpmuludq %ymm0,%ymm13,%ymm\h1 +vpmuludq %ymm0,%ymm14,%ymm\h2 +vpmuludq %ymm0,%ymm15,%ymm\h3 +vpmuludq %ymm1,%ymm\h0,%ymm\h0 +vpmuludq %ymm1,%ymm\h1,%ymm\h1 +vpmuludq %ymm1,%ymm\h2,%ymm\h2 +vpmuludq %ymm1,%ymm\h3,%ymm\h3 +vpaddq %ymm12,%ymm\h0,%ymm\h0 +vpaddq %ymm13,%ymm\h1,%ymm\h1 +vpaddq %ymm14,%ymm\h2,%ymm\h2 +vpaddq %ymm15,%ymm\h3,%ymm\h3 +vpsrlq $32,%ymm\h0,%ymm\h0 +vpsrlq $32,%ymm\h1,%ymm\h1 +vpsrlq $32,%ymm\h2,%ymm\h2 +vpsrlq $32,%ymm\h3,%ymm\h3 +.endm + +.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx +PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm6 +vmovdqa 32(%rsi),%ymm7 +vmovdqa 64(%rsi),%ymm5 +vmovdqa 96(%rsi),%ymm10 + +#reorder +shuffle8 6,5,8,5 +shuffle8 7,10,6,10 + +shuffle4 8,6,4,6 +shuffle4 5,10,8,10 + +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq 
$32,%ymm10,%ymm11 + +level0: +vpmovzxdq (%rdx),%ymm3 +vpmovzxdq 16(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpmovzxdq 32(%rdx),%ymm5 +vpmovzxdq 48(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level1: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpmovzxdq 64(%rdx),%ymm15 +vpmovzxdq 80(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level2: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpmovzxdq 96(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#shuffle +shuffle4 4,5,3,5 +shuffle4 6,7,4,7 +shuffle4 8,9,6,9 +shuffle4 10,11,8,11 + +level3: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 112(%rdx),%ymm14 +vpbroadcastd 116(%rdx),%ymm15 +vpblendd $0xF0,%ymm15,%ymm14,%ymm10 + +butterfly 3,4,6,8,5,7,9,11 10,10 + +#shuffle +shuffle8 3,4,10,4 +shuffle8 6,8,3,8 +shuffle8 5,7,6,7 +shuffle8 9,11,5,11 + +level4: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 120(%rdx),%ymm9 + +butterfly 10,3,6,5,4,8,7,11 9,9 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm4,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx 
+PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x256q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 256(%rsi),%ymm5 +vmovdqa 512(%rsi),%ymm6 +vmovdqa 768(%rsi),%ymm7 +vmovdqa 1024(%rsi),%ymm8 +vmovdqa 1280(%rsi),%ymm9 +vmovdqa 1536(%rsi),%ymm10 +vmovdqa 1792(%rsi),%ymm11 + +level5: +vpbroadcastd (%rdx),%ymm3 +vpbroadcastd 4(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpbroadcastd 8(%rdx),%ymm5 +vpbroadcastd 12(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level6: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 16(%rdx),%ymm15 +vpbroadcastd 20(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level7: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 24(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xdiv(%rip),%ymm3 + +vpmuludq %ymm3,%ymm4,%ymm4 +vpmuludq %ymm3,%ymm5,%ymm5 +vpmuludq %ymm3,%ymm6,%ymm6 +vpmuludq %ymm3,%ymm7,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq 
%ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm12,%ymm4,%ymm4 +vpaddq %ymm13,%ymm5,%ymm5 +vpaddq %ymm14,%ymm6,%ymm6 +vpaddq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm6,%ymm6 +vpsrlq $32,%ymm7,%ymm7 + +#store +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_mask(%rip),%ymm3 +vpermd %ymm4,%ymm3,%ymm4 +vpermd %ymm5,%ymm3,%ymm5 +vpermd %ymm6,%ymm3,%ymm6 +vpermd %ymm7,%ymm3,%ymm7 +vpermd %ymm8,%ymm3,%ymm8 +vpermd %ymm9,%ymm3,%ymm9 +vpermd %ymm10,%ymm3,%ymm10 +vpermd %ymm11,%ymm3,%ymm11 +vmovdqa %xmm4,(%rdi) +vmovdqa %xmm5,128(%rdi) +vmovdqa %xmm6,256(%rdi) +vmovdqa %xmm7,384(%rdi) +vmovdqa %xmm8,512(%rdi) +vmovdqa %xmm9,640(%rdi) +vmovdqa %xmm10,768(%rdi) +vmovdqa %xmm11,896(%rdi) + +ret diff --git a/crypto_sign/dilithium2/avx2/ntt.h b/crypto_sign/dilithium2/avx2/ntt.h new file mode 100644 index 00000000..53837202 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/ntt.h @@ -0,0 +1,26 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H +#define PQCLEAN_DILITHIUM2_AVX2_NTT_H + +#include <stdint.h> + +#include "nttconsts.h" +#include "params.h" + +void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas); +void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas); + +void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv); + +void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); + +#endif diff --git a/crypto_sign/dilithium2/avx2/ntt.s b/crypto_sign/dilithium2/avx2/ntt.s new file mode 100644 index 00000000..e69a5f89 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/ntt.s @@ -0,0 +1,178 @@ +.include "shuffle.inc" + +.macro butterfly
rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 +#mul +vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 +vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 +vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 +vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 + +#reduce +vpmuludq %ymm0,%ymm\rh0,%ymm12 +vpmuludq %ymm0,%ymm\rh1,%ymm13 +vpmuludq %ymm0,%ymm\rh2,%ymm14 +vpmuludq %ymm0,%ymm\rh3,%ymm15 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm\rh0,%ymm12,%ymm12 +vpaddq %ymm\rh1,%ymm13,%ymm13 +vpaddq %ymm\rh2,%ymm14,%ymm14 +vpaddq %ymm\rh3,%ymm15,%ymm15 +vpsrlq $32,%ymm12,%ymm12 +vpsrlq $32,%ymm13,%ymm13 +vpsrlq $32,%ymm14,%ymm14 +vpsrlq $32,%ymm15,%ymm15 + +#update +vpaddd %ymm2,%ymm\rl0,%ymm\rh0 +vpaddd %ymm2,%ymm\rl1,%ymm\rh1 +vpaddd %ymm2,%ymm\rl2,%ymm\rh2 +vpaddd %ymm2,%ymm\rl3,%ymm\rh3 +vpaddd %ymm12,%ymm\rl0,%ymm\rl0 +vpaddd %ymm13,%ymm\rl1,%ymm\rl1 +vpaddd %ymm14,%ymm\rl2,%ymm\rl2 +vpaddd %ymm15,%ymm\rl3,%ymm\rl3 +vpsubd %ymm12,%ymm\rh0,%ymm\rh0 +vpsubd %ymm13,%ymm\rh1,%ymm\rh1 +vpsubd %ymm14,%ymm\rh2,%ymm\rh2 +vpsubd %ymm15,%ymm\rh3,%ymm\rh3 +.endm + +.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx +PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 + +level0: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd (%rdx),%ymm3 + +#load +vpmovzxdq (%rsi),%ymm4 +vpmovzxdq 128(%rsi),%ymm5 +vpmovzxdq 256(%rsi),%ymm6 +vpmovzxdq 384(%rsi),%ymm7 +vpmovzxdq 512(%rsi),%ymm8 +vpmovzxdq 640(%rsi),%ymm9 +vpmovzxdq 768(%rsi),%ymm10 +vpmovzxdq 896(%rsi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +level1: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 + +butterfly 4,5,8,9,6,7,10,11 12,12,13,13 + +level2: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 12(%rdx),%ymm12 +vpbroadcastd 16(%rdx),%ymm13 +vpbroadcastd 20(%rdx),%ymm14 +vpbroadcastd 24(%rdx),%ymm15 + 
+butterfly 4,6,8,10,5,7,9,11 12,13,14,15 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,256(%rdi) +vmovdqa %ymm6,512(%rdi) +vmovdqa %ymm7,768(%rdi) +vmovdqa %ymm8,1024(%rdi) +vmovdqa %ymm9,1280(%rdi) +vmovdqa %ymm10,1536(%rdi) +vmovdqa %ymm11,1792(%rdi) + +ret + +.global PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx +PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x2q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 32(%rsi),%ymm5 +vmovdqa 64(%rsi),%ymm6 +vmovdqa 96(%rsi),%ymm7 +vmovdqa 128(%rsi),%ymm8 +vmovdqa 160(%rsi),%ymm9 +vmovdqa 192(%rsi),%ymm10 +vmovdqa 224(%rsi),%ymm11 + +level3: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd (%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 + +level4: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 +vpblendd $0xF0,%ymm13,%ymm12,%ymm12 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly 3,8,4,9,5,10,6,11 12,12,12,12 + +level5: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpmovzxdq 12(%rdx),%ymm12 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly 7,5,3,10,8,6,4,11 12,12,12,12 + +level6: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpmovzxdq 28(%rdx),%ymm12 +vpmovzxdq 44(%rdx),%ymm13 + +butterfly 7,5,8,6,3,10,4,11 12,12,13,13 + +level7: +#PQCLEAN_DILITHIUM2_AVX2_zetas +vpmovzxdq 60(%rdx),%ymm12 +vpmovzxdq 76(%rdx),%ymm13 +vpmovzxdq 92(%rdx),%ymm14 +vpmovzxdq 108(%rdx),%ymm15 + +butterfly 7,3,8,4,5,10,6,11 12,13,14,15 + +#store +vpsllq $32,%ymm5,%ymm5 +vpsllq $32,%ymm10,%ymm10 +vpsllq $32,%ymm6,%ymm6 +vpsllq $32,%ymm11,%ymm11 +vpblendd $0xAA,%ymm5,%ymm7,%ymm7 +vpblendd $0xAA,%ymm10,%ymm3,%ymm3 +vpblendd $0xAA,%ymm6,%ymm8,%ymm8 +vpblendd $0xAA,%ymm11,%ymm4,%ymm4 + +shuffle4 7,3,5,3 +shuffle4 8,4,7,4 + +shuffle8 5,7,6,7 +shuffle8 3,4,5,4 + +vmovdqa %ymm6,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa 
%ymm7,64(%rdi) +vmovdqa %ymm4,96(%rdi) + +ret diff --git a/crypto_sign/dilithium2/avx2/nttconsts.c b/crypto_sign/dilithium2/avx2/nttconsts.c new file mode 100644 index 00000000..7fff311d --- /dev/null +++ b/crypto_sign/dilithium2/avx2/nttconsts.c @@ -0,0 +1,80 @@ +#include "nttconsts.h" + +#define QINV 4236238847 // -q^(-1) mod 2^32 +#define MONT 4193792ULL +#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) + + +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; + +#undef QINV +#undef MONT +#undef DIV + + +const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas = { + .as_arr = { + 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, + 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, + 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191, + 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, + 7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, + 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 
7611795, 819034, 7857917, 3207046, 4784579, + 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, + 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, + 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, + 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, + 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, + 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, + 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, + 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, + 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, + 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, + 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, + 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, + 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, + 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, + 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, + 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, + 4834730, 7018208, 1976782 + } +}; + +const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv = { + .as_arr = { + 6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, + 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 
3628969, 3019102, 812732, 7094748, + 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, + 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, + 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, + 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, + 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, + 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, + 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, + 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, + 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, + 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, + 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, + 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, + 5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, + 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, + 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, + 1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, + 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326, + 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, + 3930395, 
4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, + 518909, 2608894, 3975713 + } +}; diff --git a/crypto_sign/dilithium2/avx2/nttconsts.h b/crypto_sign/dilithium2/avx2/nttconsts.h new file mode 100644 index 00000000..c2dd6b51 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/nttconsts.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H +#define PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H + +#include <immintrin.h> +#include <stdint.h> + +#include "alignment.h" +#include "params.h" + +typedef ALIGNED_UINT32(8) aligned_uint32x8_t; + +typedef ALIGNED_UINT32(N) aligned_uint32xN_t; + + +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xqinv; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xq; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x2q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x256q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_mask; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8x23ones; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM2_AVX2_8xdiv; + +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas; +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv; + +#endif //PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H + diff --git a/crypto_sign/dilithium2/avx2/packing.c b/crypto_sign/dilithium2/avx2/packing.c new file mode 100644 index 00000000..6dc7a0ec --- /dev/null +++ b/crypto_sign/dilithium2/avx2/packing.c @@ -0,0 +1,305 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1).
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +* - const polyveck *t0: pointer to vector t0 +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t key[]: output byte array for key +* - uint8_t tr[]: output byte array for tr +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - polyveck *t0: pointer to output vector t0 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (z, h, c).
+* +* Arguments: - uint8_t sig[]: output byte array +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +* - const poly *c: pointer to challenge polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { + unsigned int i, j, k; + uint64_t signs, mask; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + } + sig += L * POLZ_SIZE_PACKED; + + /* Encode h: indices of the nonzero hint coefficients, followed by the running count after each polynomial at sig[OMEGA + i]; unused index bytes are zeroed */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t)j; + } + } + + sig[OMEGA + i] = (uint8_t)k; + } + while (k < OMEGA) { + sig[k++] = 0; + } + sig += OMEGA + K; + + /* Encode c: N/8 bytes with one bit per nonzero coefficient, then 8 bytes of sign bits (a bit is set when the nonzero coefficient equals Q - 1) */ + signs = 0; + mask = 1; + for (i = 0; i < N / 8; ++i) { + sig[i] = 0; + for (j = 0; j < 8; ++j) { + if (c->coeffs[8 * i + j] != 0) { + sig[i] |= (uint8_t)(1u << j); + if (c->coeffs[8 * i + j] == (Q - 1)) { + signs |= mask; + } + mask <<= 1; + } + } + } + sig += N / 8; + for (i = 0; i < 8; ++i) { + sig[i] = (uint8_t)(signs >> 8u * i); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sig +* +* Description: Unpack signature sig = (z, h, c). +* +* Arguments: - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - poly *c: pointer to output challenge polynomial +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { + unsigned int i, j, k; + uint64_t signs; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + } + sig += L * POLZ_SIZE_PACKED; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + sig += OMEGA + K; + + /* Decode c */ + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)sig[N / 8 + i] << 8 * i; + } + + /* Extra sign bits are zero for strong unforgeability */ + if (signs >> 60) { + return 1; + } + + for (i = 0; i < N / 8; ++i) { + for (j = 0; j < 8; ++j) { + if ((sig[i] >> j) & 0x01) { + c->coeffs[8 * i + j] = 1; + c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); + signs >>= 1; + } + } + } + + return 0; +} diff --git a/crypto_sign/dilithium2/avx2/packing.h b/crypto_sign/dilithium2/avx2/packing.h new file mode 100644 index 00000000..c8f90729 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/packing.h @@ -0,0 +1,36 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM2_AVX2_PACKING_H + +#include "params.h" +#include "polyvec.h" + +void PQCLEAN_DILITHIUM2_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM2_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, 
+ const polyveck *t0); +void PQCLEAN_DILITHIUM2_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); + +void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); + +#endif diff --git a/crypto_sign/dilithium2/avx2/params.h b/crypto_sign/dilithium2/avx2/params.h new file mode 100644 index 00000000..dc6f130b --- /dev/null +++ b/crypto_sign/dilithium2/avx2/params.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define QBITS 23 +#define D 14 +#define GAMMA1 ((Q - 1)/16) +#define GAMMA2 (GAMMA1/2) +#define ALPHA (2*GAMMA2) + +#define K 4 +#define L 3 +#define ETA 6 +#define SETABITS 4 +#define BETA 325 +#define OMEGA 80 + + +#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) +#define POLT0_SIZE_PACKED ((N*D)/8) +#define POLETA_SIZE_PACKED ((N*SETABITS)/8) +#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) +#define POLW1_SIZE_PACKED ((N*4)/8) + +#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLT1_SIZE_PACKED) +#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + (L + K)*POLETA_SIZE_PACKED + CRHBYTES + K*POLT0_SIZE_PACKED) +#define CRYPTO_BYTES (L*POLZ_SIZE_PACKED + (OMEGA + K) + (N/8 + 8)) + +#endif diff --git a/crypto_sign/dilithium2/avx2/pointwise.S b/crypto_sign/dilithium2/avx2/pointwise.S new file mode 100644 index 00000000..fa2ab766 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/pointwise.S @@ -0,0 +1,189 @@ +#include "params.h" + +.global PQCLEAN_DILITHIUM2_AVX2_pointwise_avx +PQCLEAN_DILITHIUM2_AVX2_pointwise_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 + 
+xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vpsrlq $32,%ymm14,%ymm15 + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 +vpmuludq %ymm6,%ymm14,%ymm6 +vpmuludq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3 +vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpaddq %ymm6,%ymm14,%ymm6 +vpaddq %ymm7,%ymm15,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3 
+vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm6,%ymm10,%ymm6 +vpmuludq %ymm7,%ymm11,%ymm7 +vpmuludq %ymm8,%ymm12,%ymm8 +vpmuludq %ymm9,%ymm13,%ymm9 +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 +.endm + +.global PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx +PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 + +pointwise 1024 +acc + +pointwise 2048 +acc + + + +#reduce +vpmuludq %ymm0,%ymm2,%ymm6 +vpmuludq %ymm0,%ymm3,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm8 +vpmuludq %ymm0,%ymm5,%ymm9 +vpmuludq %ymm1,%ymm6,%ymm6 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm8,%ymm8 +vpmuludq %ymm1,%ymm9,%ymm9 +vpaddq %ymm2,%ymm6,%ymm2 +vpaddq %ymm3,%ymm7,%ymm3 +vpaddq %ymm4,%ymm8,%ymm4 +vpaddq %ymm5,%ymm9,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium2/avx2/poly.c b/crypto_sign/dilithium2/avx2/poly.c new file mode 100644 index 00000000..3a3cf295 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/poly.c @@ -0,0 +1,914 @@ +#include +#include + +#include "fips202x4.h" +#include "ntt.h" 
+#include "nttconsts.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_reduce +* +* Description: Reduce all coefficients of input polynomial to representative +* in [0,2*Q[. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a) { + PQCLEAN_DILITHIUM2_AVX2_reduce_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_csubq +* +* Description: For all coefficients of input polynomial subtract Q if +* coefficient is bigger than Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a) { + PQCLEAN_DILITHIUM2_AVX2_csubq_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_freeze +* +* Description: Reduce all coefficients of the polynomial to standard +* representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a) { + PQCLEAN_DILITHIUM2_AVX2_reduce_avx(a->coeffs); + PQCLEAN_DILITHIUM2_AVX2_csubq_avx(a->coeffs); + +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + for (i = 0; i < N / 8; i++) { + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + vec0 = _mm256_add_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_sub +* +* Description: Subtract polynomials. Assumes coefficients of second input +* polynomial to be less than 2*Q. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec); + + for (i = 0; i < N / 8; i++) { + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + /* Add 2*Q first so that subtracting b (assumed < 2*Q) cannot wrap below zero */ + vec0 = _mm256_add_epi32(vec0, twoq); + vec0 = _mm256_sub_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{32-D}.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i vec; + + for (i = 0; i < N / 8; i++) { + vec = _mm256_load_si256(&a->coeffs_x8[i]); + vec = _mm256_slli_epi32(vec, D); + _mm256_store_si256(&a->coeffs_x8[i], vec); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_ntt +* +* Description: Forward NTT. Output coefficients can be up to 16*Q larger than +* input coefficients. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + /* Two passes through a 64-bit scratch buffer: the levels0t2 kernel writes tmp, the levels3t8 kernel writes the result back into a */ + for (i = 0; i < N / 32; ++i) { + PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + 1); + } + for (i = 0; i < N / 32; ++i) { + PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + 8 + 31 * i); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery +* +* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients +* need to be less than 2*Q. Output coefficients are less than 2*Q.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + for (i = 0; i < N / 32; i++) { + PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + 31 * i); + } + for (i = 0; i < N / 32; i++) { + PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + 248); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* with 2^{-32}. Output coefficients are less than 2*Q if input +* coefficients are less than 22*Q. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { + PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives.
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients Q + c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *restrict a1, + poly *restrict a0, + const poly *restrict a) { + for (size_t i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such that c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients Q + c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_decompose( + poly *restrict a1, + poly *restrict a0, + const poly *restrict a) { + unsigned int i; + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint( + poly *restrict h, + const poly *restrict a0, + const poly *restrict a1) { + unsigned int i, s = 0; + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; + } + return s; +} + +/************************************************* + * Name: PQCLEAN_DILITHIUM2_AVX2_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *a: pointer to output polynomial with corrected high bits +* - const poly *b: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint( + poly *restrict a, + const poly *restrict b, + const poly *restrict h) { + unsigned int i; + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_use_hint(b->coeffs[i], h->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients to be standard representatives. +* +* Arguments: - const poly *a: pointer to polynomial +* - uint32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B) { + unsigned int i; + int32_t t; + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. 
*/ + for (i = 0; i < N; ++i) { + /* Absolute value of centralized representative */ + t = (Q - 1) / 2 - a->coeffs[i]; + t ^= (t >> 31); + t = (Q - 1) / 2 - t; + + if ((uint32_t)t >= B) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: rej_uniform_ref +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling using array of random bytes. +* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + /* Build a 23-bit candidate from three bytes, least significant byte first */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < Q) { + a[ctr++] = t; + } + } + + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling using the +* output stream of stream128 initialized with (seed, nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, + const unsigned char seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int nblocks = POLY_UNIFORM_NBLOCKS; + unsigned int buflen = POLY_UNIFORM_BUFLEN; + unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, nblocks, &state); + + ctr = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a->coeffs, N, buf, buflen); + + while (ctr < N) { + /* Keep the buflen % 3 trailing bytes (too few for a 3-byte candidate) at the front of buf and squeeze the next block right behind them */ + off = buflen % 3; + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; + } + + buflen = STREAM128_BLOCKBYTES + off; + stream128_squeezeblocks(buf + off, 1, &state); + ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x +* +* Description: Sample four polynomials with coefficients in [0,Q-1] at once +* by rejection sampling on four parallel SHAKE128 streams. +* Stream j absorbs seed followed by the two bytes of noncej, +* low byte first. 
+* +* Arguments: - poly *a0, *a1, *a2, *a3: pointers to output polynomials +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce0..nonce3: 2-byte nonce for each stream +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][SEEDBYTES + 2]; + unsigned char outbuf[4][5 * SHAKE128_RATE]; + __m256i state[25]; + + /* Every stream input is the shared seed followed by that stream's own nonce */ + for (i = 0; i < SEEDBYTES; ++i) { + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][SEEDBYTES + 0] = nonce0; + inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; + inbuf[1][SEEDBYTES + 0] = nonce1; + inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; + inbuf[2][SEEDBYTES + 0] = nonce2; + inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; + inbuf[3][SEEDBYTES + 0] = nonce3; + inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0],
inbuf[1], inbuf[2], inbuf[3], + SEEDBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, + state); + + ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE); + ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE); + ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE); + ctr3 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE); + + /* All four streams are squeezed together; a polynomial that is already full passes len = 0 to rej_uniform_ref and takes nothing */ + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], + SHAKE128_RATE); + ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], + SHAKE128_RATE); + ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], + SHAKE128_RATE); + ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], + SHAKE128_RATE); + } +} + +/************************************************* +* Name: rej_eta_ref +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling using array of random bytes. +* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given.
+**************************************************/ +static unsigned int rej_eta_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + /* Each byte yields two 4-bit candidates; a candidate t is kept when t <= 2*ETA and stored as Q + ETA - t */ + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 <= 2 * ETA) { + a[ctr++] = Q + ETA - t0; + } + if (t1 <= 2 * ETA && ctr < len) { + a[ctr++] = Q + ETA - t1; + } + } + + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output stream of stream128 initialized with (seed, nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, + const unsigned char seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned char buf[POLY_UNIFORM_ETA_BUFLEN]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); + + while (ctr < N) { + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x +* +* Description: Sample four polynomials with 
coefficients in [-ETA,ETA] at +* once by rejection sampling on four parallel SHAKE128 +* streams. Stream j absorbs seed followed by the two bytes +* of noncej, low byte first. +* +* Arguments: - poly *a0, *a1, *a2, *a3: pointers to output polynomials +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce0..nonce3: 2-byte nonce for each stream +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][SEEDBYTES + 2];
unsigned char outbuf[4][2 * SHAKE128_RATE]; + __m256i state[25]; + + for (i = 0; i < SEEDBYTES; ++i) { + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][SEEDBYTES + 0] = nonce0; + inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; + inbuf[1][SEEDBYTES + 0] = nonce1; + inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; + inbuf[2][SEEDBYTES + 0] = nonce2; + inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; + inbuf[3][SEEDBYTES + 0] = nonce3; + inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], + SEEDBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2, + state); + + ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE); + ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE); + ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE); + ctr3 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { + PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE); + ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE); + ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE); + ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE); + } +} + +/************************************************* +* Name: rej_gamma1m1_ref +* +* Description: Sample uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling +* using array of random bytes. 
+* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_gamma1m1_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + + ctr = pos = 0; + while (ctr < len && pos + 5 <= buflen) { + t0 = buf[pos]; + t0 |= (uint32_t)buf[pos + 1] << 8; + t0 |= (uint32_t)buf[pos + 2] << 16; + t0 &= 0xFFFFF; + + t1 = buf[pos + 2] >> 4; + t1 |= (uint32_t)buf[pos + 3] << 4; + t1 |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + + if (t0 <= 2 * GAMMA1 - 2) { + a[ctr++] = Q + GAMMA1 - 1 - t0; + } + if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { + a[ctr++] = Q + GAMMA1 - 1 - t1; + } + } + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection +* sampling on output stream of SHAKE256(seed|nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) +#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a, + const unsigned char seed[CRHBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; + unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); + + ctr = PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN); + + while (ctr < N) { + off = buflen % 5; + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; + } + + buflen = STREAM256_BLOCKBYTES + off; + stream256_squeezeblocks(buf + off, 1, &state); + ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[CRHBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][CRHBYTES + 2]; + unsigned char outbuf[4][5 * SHAKE256_RATE]; + __m256i state[25]; + + for (i = 0; i < CRHBYTES; ++i) { + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; + inbuf[0][CRHBYTES + 1] = nonce0 >> 8; + inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF; + inbuf[1][CRHBYTES + 1] = nonce1 >> 8; + inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF; + inbuf[2][CRHBYTES + 1] = nonce2 >> 8; + inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF; + inbuf[3][CRHBYTES + 1] = nonce3 
>> 8; + + PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], + CRHBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, + state); + + ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE); + ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE); + ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE); + ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { + PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], + SHAKE256_RATE); + ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], + SHAKE256_RATE); + ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], + SHAKE256_RATE); + ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], + SHAKE256_RATE); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. +* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. +* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLETA_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + unsigned char t[8]; + + for (i = 0; i < N / 2; ++i) { + t[0] = Q + ETA - a->coeffs[2 * i + 0]; + t[1] = Q + ETA - a->coeffs[2 * i + 1]; + r[i] = t[0] | (t[1] << 4); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. +* Output coefficients lie in [Q-ETA,Q+ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *restrict r, const unsigned char *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[i] & 0x0F; + r->coeffs[2 * i + 1] = a[i] >> 4; + r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. +* Input coefficients are assumed to be standard representatives. +* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLT1_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + + for (i = 0; i < N / 8; ++i) { + r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); + r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); + r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); + r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); + r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); + r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); + r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); + r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); + r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack +* +* Description: Unpack polynomial t1 with 
9-bit coefficients.
*              Output coefficients are standard representatives.
*
* Arguments:   - poly *r: pointer to output polynomial
*              - const unsigned char *a: byte array with bit-packed polynomial
**************************************************/
void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *restrict r, const unsigned char *restrict a) {
    unsigned int blk;

    /* Nine input bytes expand to eight 9-bit coefficients. */
    for (blk = 0; blk < N / 8; ++blk) {
        const unsigned char *in = &a[9 * blk];
        uint32_t *c = &r->coeffs[8 * blk];

        c[0] = ((in[0] >> 0) | ((uint32_t)in[1] << 8)) & 0x1FF;
        c[1] = ((in[1] >> 1) | ((uint32_t)in[2] << 7)) & 0x1FF;
        c[2] = ((in[2] >> 2) | ((uint32_t)in[3] << 6)) & 0x1FF;
        c[3] = ((in[3] >> 3) | ((uint32_t)in[4] << 5)) & 0x1FF;
        c[4] = ((in[4] >> 4) | ((uint32_t)in[5] << 4)) & 0x1FF;
        c[5] = ((in[5] >> 5) | ((uint32_t)in[6] << 3)) & 0x1FF;
        c[6] = ((in[6] >> 6) | ((uint32_t)in[7] << 2)) & 0x1FF;
        c[7] = ((in[7] >> 7) | ((uint32_t)in[8] << 1)) & 0x1FF;
    }
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_polyt0_pack
*
* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
*              Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
+* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLT0_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + + for (i = 0; i < N / 4; ++i) { + t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; + t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; + t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; + t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + + r[7 * i + 0] = t[0]; + r[7 * i + 1] = t[0] >> 8; + r[7 * i + 1] |= t[1] << 6; + r[7 * i + 2] = t[1] >> 2; + r[7 * i + 3] = t[1] >> 10; + r[7 * i + 3] |= t[2] << 4; + r[7 * i + 4] = t[2] >> 4; + r[7 * i + 5] = t[2] >> 12; + r[7 * i + 5] |= t[3] << 2; + r[7 * i + 6] = t[3] >> 6; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. +* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *restrict r, const unsigned char *restrict a) { + unsigned int i; + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[7 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; + + r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; + r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + + r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + + r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + + r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; + r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyz_pack +* +* Description: Bit-pack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1]. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLZ_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + uint32_t t[2]; + + for (i = 0; i < N / 2; ++i) { + /* Map to {0,...,2*GAMMA1 - 2} */ + t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; + t[0] += ((int32_t)t[0] >> 31) & Q; + t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; + t[1] += ((int32_t)t[1] >> 31) & Q; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1]. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *restrict r, const unsigned char *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[5 * i + 0]; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; + + r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + + r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; + r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + } + +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyw1_pack +* 
+* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. +* Input coefficients are assumed to be standard representatives. +* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLW1_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); + } +} diff --git a/crypto_sign/dilithium2/avx2/poly.h b/crypto_sign/dilithium2/avx2/poly.h new file mode 100644 index 00000000..d726a55f --- /dev/null +++ b/crypto_sign/dilithium2/avx2/poly.h @@ -0,0 +1,83 @@ +#ifndef POLY_H +#define POLY_H + +#include +#include + +#include "alignment.h" +#include "params.h" + +typedef union { + uint32_t coeffs[N]; + __m256i coeffs_x8[N / 8]; +} poly; + +void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); + +int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, + const 
uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); + +void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t *r, const poly *a); +#endif diff --git a/crypto_sign/dilithium2/avx2/polyvec.c b/crypto_sign/dilithium2/avx2/polyvec.c new file mode 100644 index 00000000..718e8375 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/polyvec.c @@ -0,0 +1,353 @@ +#include + +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + 
+/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. 
+* Input coefficients are assumed to be less than 22*Q. Output +* coeffcient are less than 2*L*Q. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input coefficients to be standard representatives. +* +* Arguments: - const polyvecl *v: pointer to vector +* - uint32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B and 1 +* otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [0,2*Q[. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq +* +* Description: For all coefficients of polynomials in vector of length K +* subtract Q if coefficient is bigger than Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_csubq(&v->vec[i]); + } +} + +/************************************************* +* Name: polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* Assumes coefficients of polynomials in second input vector +* to be less than 2*Q. No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{32-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input coefficients to be standard representatives. +* +* Arguments: - const polyveck *v: pointer to vector +* - uint32_t B: norm bound +* +* Returns 0 if norm of all polynomials are strictly smaller than B and 1 +* otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients Q + a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients Q + a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - polyveck *h: pointer to output vector +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +/************************************************* +* Name: polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *v: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); + } +} diff --git a/crypto_sign/dilithium2/avx2/polyvec.h b/crypto_sign/dilithium2/avx2/polyvec.h new file mode 100644 index 00000000..a892cb6d --- /dev/null +++ b/crypto_sign/dilithium2/avx2/polyvec.h @@ -0,0 +1,52 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H + +#include + +#include "params.h" +#include "poly.h" + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + +int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v); +void 
PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v); + +int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t B); + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); + +#endif diff --git a/crypto_sign/dilithium2/avx2/reduce.h b/crypto_sign/dilithium2/avx2/reduce.h new file mode 100644 index 00000000..74d4dd2e --- /dev/null +++ b/crypto_sign/dilithium2/avx2/reduce.h @@ -0,0 +1,9 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include + +void PQCLEAN_DILITHIUM2_AVX2_reduce_avx(uint32_t a[N]); +void PQCLEAN_DILITHIUM2_AVX2_csubq_avx(uint32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium2/avx2/reduce.s b/crypto_sign/dilithium2/avx2/reduce.s new file mode 100644 index 00000000..85a9eb1c --- /dev/null +++ b/crypto_sign/dilithium2/avx2/reduce.s @@ -0,0 +1,91 @@ +.global PQCLEAN_DILITHIUM2_AVX2_reduce_avx +PQCLEAN_DILITHIUM2_AVX2_reduce_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8x23ones(%rip),%ymm0 + +xor %eax,%eax +_looptop_rdc32: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm7 + +#reduce +vpsrld $23,%ymm1,%ymm2 +vpsrld $23,%ymm3,%ymm4 +vpsrld $23,%ymm5,%ymm6 +vpsrld $23,%ymm7,%ymm8 +vpand %ymm0,%ymm1,%ymm1 
# Conditional subtraction, in place, on the array whose address is in %rdi.
# The loop runs 8 times and each pass handles four 32-byte vectors
# (128 bytes), so 1024 bytes = 256 32-bit lanes are processed in total.
# All loads and stores are vmovdqa, so %rdi must be 32-byte aligned.
# Clobbers %eax, %rdi, the flags and %ymm0-%ymm8.
.global PQCLEAN_DILITHIUM2_AVX2_csubq_avx
PQCLEAN_DILITHIUM2_AVX2_csubq_avx:
#consts
# %ymm0 <- constant vector defined elsewhere (presumably q in every 32-bit lane)
vmovdqa _PQCLEAN_DILITHIUM2_AVX2_8xq(%rip),%ymm0

# %eax is the loop counter
xor %eax,%eax
_looptop_csubq:
#load
vmovdqa (%rdi),%ymm1
vmovdqa 32(%rdi),%ymm3
vmovdqa 64(%rdi),%ymm5
vmovdqa 96(%rdi),%ymm7

#csubq: x <- x - c, then add c back in every lane where the result went negative
vpsubd %ymm0,%ymm1,%ymm1
vpsubd %ymm0,%ymm3,%ymm3
vpsubd %ymm0,%ymm5,%ymm5
vpsubd %ymm0,%ymm7,%ymm7
# arithmetic shift by 31 turns the sign bit into an all-ones / all-zeros lane mask
vpsrad $31,%ymm1,%ymm2
vpsrad $31,%ymm3,%ymm4
vpsrad $31,%ymm5,%ymm6
vpsrad $31,%ymm7,%ymm8
# mask & c: c in the lanes that underflowed, 0 elsewhere
vpand %ymm0,%ymm2,%ymm2
vpand %ymm0,%ymm4,%ymm4
vpand %ymm0,%ymm6,%ymm6
vpand %ymm0,%ymm8,%ymm8
# undo the subtraction only in the underflowed lanes
vpaddd %ymm2,%ymm1,%ymm1
vpaddd %ymm4,%ymm3,%ymm3
vpaddd %ymm6,%ymm5,%ymm5
vpaddd %ymm8,%ymm7,%ymm7

#store
vmovdqa %ymm1,(%rdi)
vmovdqa %ymm3,32(%rdi)
vmovdqa %ymm5,64(%rdi)
vmovdqa %ymm7,96(%rdi)

# advance to the next 128-byte group; stop after 8 groups
add $128,%rdi
add $1,%eax
cmp $8,%eax
jb _looptop_csubq

ret
3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + 
{ 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, 
+ { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 0, 0, 0, 0}, + { 1, 2, 6, 7, 0, 0, 0, 0}, + { 0, 1, 2, 6, 7, 0, 0, 0}, + { 3, 6, 7, 0, 0, 0, 0, 0}, + { 0, 3, 6, 7, 0, 0, 0, 0}, + { 1, 3, 6, 7, 0, 0, 0, 0}, + { 0, 1, 3, 6, 7, 0, 0, 0}, + { 2, 3, 6, 7, 0, 0, 0, 0}, + { 0, 2, 3, 6, 7, 0, 0, 0}, + { 1, 2, 3, 6, 7, 0, 0, 0}, + { 0, 1, 2, 3, 6, 7, 0, 0}, + { 4, 6, 7, 0, 0, 0, 0, 0}, + { 0, 4, 6, 7, 0, 0, 0, 0}, + { 1, 4, 6, 7, 0, 0, 0, 0}, + { 0, 1, 4, 6, 7, 0, 0, 0}, + { 2, 4, 6, 7, 0, 0, 0, 0}, + { 0, 2, 4, 6, 7, 0, 0, 0}, + { 1, 2, 4, 6, 7, 0, 0, 
0}, + { 0, 1, 2, 4, 6, 7, 0, 0}, + { 3, 4, 6, 7, 0, 0, 0, 0}, + { 0, 3, 4, 6, 7, 0, 0, 0}, + { 1, 3, 4, 6, 7, 0, 0, 0}, + { 0, 1, 3, 4, 6, 7, 0, 0}, + { 2, 3, 4, 6, 7, 0, 0, 0}, + { 0, 2, 3, 4, 6, 7, 0, 0}, + { 1, 2, 3, 4, 6, 7, 0, 0}, + { 0, 1, 2, 3, 4, 6, 7, 0}, + { 5, 6, 7, 0, 0, 0, 0, 0}, + { 0, 5, 6, 7, 0, 0, 0, 0}, + { 1, 5, 6, 7, 0, 0, 0, 0}, + { 0, 1, 5, 6, 7, 0, 0, 0}, + { 2, 5, 6, 7, 0, 0, 0, 0}, + { 0, 2, 5, 6, 7, 0, 0, 0}, + { 1, 2, 5, 6, 7, 0, 0, 0}, + { 0, 1, 2, 5, 6, 7, 0, 0}, + { 3, 5, 6, 7, 0, 0, 0, 0}, + { 0, 3, 5, 6, 7, 0, 0, 0}, + { 1, 3, 5, 6, 7, 0, 0, 0}, + { 0, 1, 3, 5, 6, 7, 0, 0}, + { 2, 3, 5, 6, 7, 0, 0, 0}, + { 0, 2, 3, 5, 6, 7, 0, 0}, + { 1, 2, 3, 5, 6, 7, 0, 0}, + { 0, 1, 2, 3, 5, 6, 7, 0}, + { 4, 5, 6, 7, 0, 0, 0, 0}, + { 0, 4, 5, 6, 7, 0, 0, 0}, + { 1, 4, 5, 6, 7, 0, 0, 0}, + { 0, 1, 4, 5, 6, 7, 0, 0}, + { 2, 4, 5, 6, 7, 0, 0, 0}, + { 0, 2, 4, 5, 6, 7, 0, 0}, + { 1, 2, 4, 5, 6, 7, 0, 0}, + { 0, 1, 2, 4, 5, 6, 7, 0}, + { 3, 4, 5, 6, 7, 0, 0, 0}, + { 0, 3, 4, 5, 6, 7, 0, 0}, + { 1, 3, 4, 5, 6, 7, 0, 0}, + { 0, 1, 3, 4, 5, 6, 7, 0}, + { 2, 3, 4, 5, 6, 7, 0, 0}, + { 0, 2, 3, 4, 5, 6, 7, 0}, + { 1, 2, 3, 4, 5, 6, 7, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7} +}; + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform( + uint32_t *r, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int i, ctr, pos; + uint32_t vec[8]; + __m256i d, tmp; + uint32_t good; + const __m256i bound = _mm256_set1_epi32(Q); + + ctr = pos = 0; + while (ctr + 8 <= len && pos + 24 <= buflen) { + for (i = 0; i < 8; i++) { + vec[i] = buf[pos++]; + vec[i] |= (uint32_t)buf[pos++] << 8; + vec[i] |= (uint32_t)buf[pos++] << 16; + vec[i] &= 0x7FFFFF; + } + + d = _mm256_loadu_si256((__m256i_u *)vec); + tmp = _mm256_cmpgt_epi32(bound, d); + good = _mm256_movemask_ps((__m256)tmp); + + __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); + tmp = _mm256_cvtepu8_epi32(rid); + d = _mm256_permutevar8x32_epi32(d, tmp); + _mm256_storeu_si256((__m256i_u *)&r[ctr], 
d); + ctr += __builtin_popcount(good); + } + + while (ctr < len && pos + 3 <= buflen) { + vec[0] = buf[pos++]; + vec[0] |= (uint32_t)buf[pos++] << 8; + vec[0] |= (uint32_t)buf[pos++] << 16; + vec[0] &= 0x7FFFFF; + + if (vec[0] < Q) { + r[ctr++] = vec[0]; + } + } + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta( + uint32_t *r, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int i, ctr, pos; + uint8_t vec[32]; + __m256i tmp0, tmp1; + __m128i d0, d1, rid; + uint32_t good; + const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); + const __m256i off = _mm256_set1_epi32(Q + ETA); + + ctr = pos = 0; + while (ctr + 32 <= len && pos + 16 <= buflen) { + for (i = 0; i < 16; i++) { + vec[2 * i + 0] = buf[pos] & 0x0F; + vec[2 * i + 1] = buf[pos++] >> 4; + } + + tmp0 = _mm256_loadu_si256((__m256i_u *)vec); + tmp1 = _mm256_cmpgt_epi8(bound, tmp0); + good = _mm256_movemask_epi8(tmp1); + + d0 = _mm256_castsi256_si128(tmp0); + rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount(good & 0xFF); + + d0 = _mm_bsrli_si128(d0, 8); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 8) & 0xFF); + + d0 = _mm256_extracti128_si256(tmp0, 1); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 16) & 0xFF); + + d0 = _mm_bsrli_si128(d0, 8); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = 
_mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 24) & 0xFF); + } + + while (ctr < len && pos < buflen) { + vec[0] = buf[pos] & 0x0F; + vec[1] = buf[pos++] >> 4; + + if (vec[0] <= 2 * ETA) { + r[ctr++] = Q + ETA - vec[0]; + } + if (vec[1] <= 2 * ETA && ctr < len) { + r[ctr++] = Q + ETA - vec[1]; + } + } + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( + uint32_t *r, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int i, ctr, pos; + uint32_t vec[8]; + __m256i d, tmp; + uint32_t good; + const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); + const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); + + ctr = pos = 0; + while (ctr + 8 <= len && pos + 20 <= buflen) { + for (i = 0; i < 4; i++) { + vec[2 * i + 0] = buf[pos + 0]; + vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; + vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; + vec[2 * i + 0] &= 0xFFFFF; + + vec[2 * i + 1] = buf[pos + 2] >> 4; + vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4; + vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + } + + d = _mm256_loadu_si256((__m256i_u *)vec); + tmp = _mm256_cmpgt_epi32(bound, d); + good = _mm256_movemask_ps((__m256)tmp); + d = _mm256_sub_epi32(off, d); + + __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); + tmp = _mm256_cvtepu8_epi32(rid); + d = _mm256_permutevar8x32_epi32(d, tmp); + _mm256_storeu_si256((__m256i_u *)&r[ctr], d); + ctr += __builtin_popcount(good); + } + + while (ctr < len && pos + 5 <= buflen) { + vec[0] = buf[pos + 0]; + vec[0] |= (uint32_t)buf[pos + 1] << 8; + vec[0] |= (uint32_t)buf[pos + 2] << 16; + vec[0] &= 0xFFFFF; + + vec[1] = buf[pos + 2] >> 4; + vec[1] |= (uint32_t)buf[pos + 3] << 4; + vec[1] |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + + if (vec[0] <= 2 * GAMMA1 - 2) { + r[ctr++] = Q + GAMMA1 - 1 - vec[0]; + } + if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { + 
r[ctr++] = Q + GAMMA1 - 1 - vec[1]; + } + } + + return ctr; +} diff --git a/crypto_sign/dilithium2/avx2/rejsample.h b/crypto_sign/dilithium2/avx2/rejsample.h new file mode 100644 index 00000000..80da5b3c --- /dev/null +++ b/crypto_sign/dilithium2/avx2/rejsample.h @@ -0,0 +1,26 @@ +#ifndef REJSAMPLE_H +#define REJSAMPLE_H + +#include + +#include "poly.h" + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +#endif diff --git a/crypto_sign/dilithium2/avx2/rounding.c b/crypto_sign/dilithium2/avx2/rounding.c new file mode 100644 index 00000000..ae8f6f5b --- /dev/null +++ b/crypto_sign/dilithium2/avx2/rounding.c @@ -0,0 +1,115 @@ +#include "rounding.h" + +/************************************************* +* Name: power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. +* +* Arguments: - uint32_t a: input element +* - uint32_t *a0: pointer to output element Q + a0 +* +* Returns a1. 
+**************************************************/ +uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0) { + int32_t t; + + /* Centralized remainder mod 2^D */ + t = a & ((1U << D) - 1); + t -= (1U << (D - 1)) + 1; + t += (t >> 31) & (1U << D); + t -= (1U << (D - 1)) - 1; + *a0 = Q + t; + a = (a - t) >> D; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - uint32_t a: input element +* - uint32_t *a0: pointer to output element Q + a0 +* +* Returns a1. +**************************************************/ +uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0) { + int32_t t, u; + + /* Centralized remainder mod ALPHA */ + t = a & 0x7FFFF; + t += (a >> 19) << 9; + t -= ALPHA / 2 + 1; + t += (t >> 31) & ALPHA; + t -= ALPHA / 2 - 1; + a -= t; + + /* Divide by ALPHA (possible to avoid) */ + u = a - 1; + u >>= 31; + a = (a >> 19) + 1; + a -= u & 1; + + /* Border case */ + *a0 = Q + t - (a >> 4); + a &= 0xF; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. Inputs assumed to be +* standard representatives. +* +* Arguments: - uint32_t a0: low bits of input element +* - uint32_t a1: high bits of input element +* +* Returns 1 if high bits of a and b differ and 0 otherwise. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { + if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { + return 0; + } + + return 1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - uint32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. +**************************************************/ +uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(const uint32_t a, const unsigned int hint) { + uint32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM2_AVX2_decompose(a, &a0); + if (hint == 0) { + return a1; + } + if (a0 > Q) { + return (a1 + 1) & 0xF; + } + return (a1 - 1) & 0xF; + + /* If decompose does not divide out ALPHA: + if(hint == 0) + return a1; + else if(a0 > Q) + return (a1 + ALPHA) % (Q - 1); + else + return (a1 - ALPHA) % (Q - 1); + */ +} diff --git a/crypto_sign/dilithium2/avx2/rounding.h b/crypto_sign/dilithium2/avx2/rounding.h new file mode 100644 index 00000000..3a30aa15 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/rounding.h @@ -0,0 +1,12 @@ +#ifndef ROUNDING_H +#define ROUNDING_H + +#include "params.h" +#include + +uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0); +uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0); +unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(uint32_t a0, uint32_t a1); +uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(uint32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium2/avx2/shuffle.inc b/crypto_sign/dilithium2/avx2/shuffle.inc new file mode 100644 index 00000000..df352030 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/shuffle.inc @@ -0,0 +1,23 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq 
%ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +vpsllq $32,%ymm\r1,%ymm12 +vpsrlq $32,%ymm\r0,%ymm13 +vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm12 +vpsrld $16,%ymm\r0,%ymm13 +vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium2/avx2/sign.c b/crypto_sign/dilithium2/avx2/sign.c new file mode 100644 index 00000000..2af77c84 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/sign.c @@ -0,0 +1,433 @@ +#include +#include + +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|i|j). +* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[4], const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[0].vec[0], + &mat[0].vec[1], + &mat[0].vec[2], + &mat[1].vec[0], + rho, 0, 1, 2, 256); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[1].vec[1], + &mat[1].vec[2], + &mat[2].vec[0], + &mat[2].vec[1], + rho, 257, 258, 512, 513); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[2].vec[2], + &mat[3].vec[0], + &mat[3].vec[1], + &mat[3].vec[2], + rho, 514, 768, 769, 770); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_challenge +* +* Description: Implementation of H. Samples polynomial with 60 nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(mu|w1). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing mu +* - const polyveck *w1: pointer to vector w1 +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, + const uint8_t mu[CRHBYTES], + const polyveck *w1) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; + shake256ctx state; + + for (i = 0; i < CRHBYTES; ++i) { + inbuf[i] = mu[i]; + } + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); + } + + shake256_absorb(&state, inbuf, sizeof(inbuf)); + shake256_squeezeblocks(outbuf, 1, &state); + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t) outbuf[i] << 8 * i; + } + + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + for (i = 196; i < 256; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_squeezeblocks(outbuf, 1, &state); + pos = 0; + } + + b = outbuf[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1; + c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); + signs >>= 1; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. 
/*************************************************
* Name:        PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair
*
* Description: Generates public and private key.
*
* Arguments:   - uint8_t *pk: pointer to output public key (allocated
*                             array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes)
*              - uint8_t *sk: pointer to output private key (allocated
*                             array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes)
*
* Returns 0 (success)
**************************************************/
int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
    unsigned int i;
    uint8_t seedbuf[3 * SEEDBYTES];
    uint8_t tr[CRHBYTES];
    const uint8_t *rho, *rhoprime, *key;
    uint16_t nonce = 0;
    polyvecl mat[K];
    polyvecl s1, s1hat;
    polyveck s2, t, t1, t0;

    /* Draw 3 * SEEDBYTES random bytes and split them into the three
     * seeds rho, rhoprime and key (SEEDBYTES each) */
    randombytes(seedbuf, 3 * SEEDBYTES);
    rho = seedbuf;
    rhoprime = seedbuf + SEEDBYTES;
    key = seedbuf + 2 * SEEDBYTES;

    /* Expand matrix */
    PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho);

    /* Sample short vectors s1 and s2 from rhoprime, four polynomials per
     * call: s1 uses nonces 0..2, s2 uses nonces 3..6. The second call has
     * only three real outputs; its fourth lane writes into t.vec[0] as a
     * scratch target, which is overwritten by the multiplication below. */
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s2.vec[0], rhoprime,
            nonce, nonce + 1, nonce + 2, nonce + 3);
    PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &t.vec[0], rhoprime,
            nonce + 4, nonce + 5, nonce + 6, 0);

    /* Matrix-vector multiplication: t = A * s1, done on an NTT copy of s1
     * so the original s1 can still be packed into the secret key */
    s1hat = s1;
    PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1hat);
    for (i = 0; i < K; ++i) {
        PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat);
        /* NOTE(review): the signing path calls poly_reduce between these
         * two steps; here that call is disabled. Confirm the accumulator
         * output range makes it unnecessary for s1-sized inputs. */
        //PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&t.vec[i]);
        PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&t.vec[i]);
    }

    /* Add error vector s2 */
    PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&t, &t, &s2);

    /* Extract t1 and write public key */
    PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&t);
    PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(&t1, &t0, &t);
    PQCLEAN_DILITHIUM2_AVX2_pack_pk(pk, rho, &t1);

    /* Compute tr = CRH over the packed public key (rho, t1) and write
     * the secret key */
    crh(tr, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES);
    PQCLEAN_DILITHIUM2_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

    return 0;
}
Matrix-vector multiplication */ + yhat = y; + PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&yhat); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&w.vec[i]); + } + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w); + PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &w0, &w); + PQCLEAN_DILITHIUM2_AVX2_challenge(&c, mu, &w1); + chat = c; + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&cs2.vec[i]); + } + PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&w0, &w0, &cs2); + PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&w0); + if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&z.vec[i]); + } + PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(&z); + if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&ct0.vec[i]); + } + + PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&ct0); + if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&w0, &w0, &ct0); + PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w0); + n = PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(&h, &w0, 
&w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature */ + PQCLEAN_DILITHIUM2_AVX2_pack_sig(sig, &z, &h, &c); + *siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - unsigned long long *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - unsigned long long mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, m, mlen); + rc = PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); + *smlen += mlen; + return rc; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *sig: signature +* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) +* - uint8_t *m: pointer to message +* - size_t *mlen: pointer to output length of message +* - uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk) { + size_t i; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + poly c, chat, cp; + polyvecl mat[K], z; + polyveck t1, w1, h, tmp1, tmp2; + + if (siglen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM2_AVX2_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM2_AVX2_unpack_sig(&z, &h, &c, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); + + shake256incctx state; + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&z); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); + } + + chat = c; + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); + PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t1); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); + } + + PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); + PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(&tmp1); + 
PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(&tmp1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&tmp1); + PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(&w1, &tmp1, &h); + + /* Call random oracle and verify challenge */ + PQCLEAN_DILITHIUM2_AVX2_challenge(&cp, mu, &w1); + for (i = 0; i < N; ++i) { + if (c.coeffs[i] != cp.coeffs[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - unsigned char *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - unsigned long long *mlen: pointer to output length of message +* - const unsigned char *sm: pointer to signed message +* - unsigned long long smlen: length of signed message +* - const unsigned char *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { + goto badsig; + } + *mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; + + if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + + /* Signature verification failed */ +badsig: + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium2/avx2/sign.h b/crypto_sign/dilithium2/avx2/sign.h new file mode 100644 index 00000000..a8e5c368 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/sign.h @@ -0,0 +1,15 @@ +#ifndef SIGN_H 
+#define SIGN_H
+
+#include "api.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]);
+void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES],
+                                       const polyveck *w1);
+
+
+#endif
+
diff --git a/crypto_sign/dilithium2/avx2/stream.c b/crypto_sign/dilithium2/avx2/stream.c
new file mode 100644
index 00000000..98e7a6d4
--- /dev/null
+++ b/crypto_sign/dilithium2/avx2/stream.c
@@ -0,0 +1,26 @@
+#include "stream.h"
+
+#include <string.h>
+
+void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
+    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
+
+    uint8_t buf[SEEDBYTES + 2];
+    memcpy(buf, seed, SEEDBYTES);
+    buf[SEEDBYTES] = (uint8_t)nonce;
+    buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8);
+
+    shake128_absorb(state, buf, SEEDBYTES + 2);
+}
+
+
+void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
+    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+
+    uint8_t buf[CRHBYTES + 2];
+    memcpy(buf, seed, CRHBYTES);
+    buf[CRHBYTES] = (uint8_t)nonce;
+    buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8);
+
+    shake256_absorb(state, buf, CRHBYTES + 2);
+}
diff --git a/crypto_sign/dilithium2/avx2/stream.h b/crypto_sign/dilithium2/avx2/stream.h
new file mode 100644
index 00000000..9185af8c
--- /dev/null
+++ b/crypto_sign/dilithium2/avx2/stream.h
@@ -0,0 +1,15 @@
+#ifndef PQCLEAN_DILITHIUM2_AVX2_STREAM_H
+#define PQCLEAN_DILITHIUM2_AVX2_STREAM_H
+
+#include <stdint.h>
+
+#include "fips202.h"
+#include "params.h"
+
+void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(
+    shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce);
+
+void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(
+    shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce);
+
+#endif
diff --git a/crypto_sign/dilithium2/avx2/symmetric.h b/crypto_sign/dilithium2/avx2/symmetric.h
new file mode 100644
index 00000000..92c36244
--- /dev/null
+++ b/crypto_sign/dilithium2/avx2/symmetric.h
@@ -0,0 +1,23 @@
+#ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
+#define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H
+
+#include "params.h"
+#include "stream.h"
+
+
+#include "fips202.h"
+
+#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES)
+#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(STATE, SEED, NONCE)
+#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE)
+#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(STATE, SEED, NONCE)
+#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE)
+
+#define STREAM128_BLOCKBYTES SHAKE128_RATE
+#define STREAM256_BLOCKBYTES SHAKE256_RATE
+
+typedef shake128ctx stream128_state;
+typedef shake256ctx stream256_state;
+
+
+#endif
diff --git a/crypto_sign/dilithium2/clean/LICENSE b/crypto_sign/dilithium2/clean/LICENSE
index 0299dbff..40541676 100644
--- a/crypto_sign/dilithium2/clean/LICENSE
+++ b/crypto_sign/dilithium2/clean/LICENSE
@@ -1,2 +1,6 @@
-Public Domain
-Authors: Léo Ducas, Eike Kiltz, Tancrède Lepoint, Vadim Lyubashevsky, Gregor Seiler, Peter Schwabe, Damien Stehlé
+Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/)
+
+For Keccak and the random number generator
+we are using public-domain code from sources
+and by authors listed in comments on top of
+the respective files.
diff --git a/crypto_sign/dilithium2/clean/Makefile b/crypto_sign/dilithium2/clean/Makefile index 61c55b73..f9448299 100644 --- a/crypto_sign/dilithium2/clean/Makefile +++ b/crypto_sign/dilithium2/clean/Makefile @@ -2,10 +2,10 @@ LIB=libdilithium2_clean.a -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c symmetric.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o symmetric.o +SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c +OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h + reduce.h rounding.h symmetric.h stream.h CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake index 7db58d88..5f22b2c1 100644 --- a/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake +++ b/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libdilithium2_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj symmetric.obj +OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX all: $(LIBRARY) diff --git a/crypto_sign/dilithium2/clean/api.h b/crypto_sign/dilithium2/clean/api.h index 6bba4842..ce2d43ae 100644 --- a/crypto_sign/dilithium2/clean/api.h +++ b/crypto_sign/dilithium2/clean/api.h @@ -4,14 +4,25 @@ #include #include -#define MODE 2 - #define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1184U #define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2800U #define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2044U + #define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2" -int 
PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair( + uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *msg, size_t len, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -21,13 +32,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium2/clean/ntt.c b/crypto_sign/dilithium2/clean/ntt.c index 45a9e98e..daff8292 100644 --- a/crypto_sign/dilithium2/clean/ntt.c +++ b/crypto_sign/dilithium2/clean/ntt.c @@ -1,11 +1,12 @@ +#include + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" -#include -/* Roots of unity in order needed by forward ntt */ -static const uint32_t zetas[N] = { +/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM2_CLEAN_ntt */ +static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas[N] = { 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, @@ -40,8 +41,8 @@ static const uint32_t zetas[N] = { 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 }; -/* Roots of unity in order needed by inverse ntt */ -static const uint32_t zetas_inv[N] = { +/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM2_CLEAN_ntt */ +static const uint32_t 
PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[N] = { 6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, @@ -77,7 +78,7 @@ static const uint32_t zetas_inv[N] = { }; /************************************************* -* Name: ntt +* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt * * Description: Forward NTT, in-place. No modular reduction is performed after * additions or subtractions. Hence output coefficients can be up @@ -86,16 +87,16 @@ static const uint32_t zetas_inv[N] = { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) { +void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t *p) { unsigned int len, start, j, k; uint32_t zeta, t; k = 1; for (len = 128; len > 0; len >>= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas[k++]; + zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); p[j + len] = p[j] + 2 * Q - t; p[j] = p[j] + t; } @@ -104,7 +105,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) { } /************************************************* -* Name: invntt_frominvmont +* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont * * Description: Inverse NTT and multiplication by Montgomery factor 2^32. * In-place. 
No modular reductions after additions or @@ -113,7 +114,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) { +void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t *p) { unsigned int start, len, j, k; uint32_t t, zeta; const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; @@ -121,17 +122,17 @@ void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) { k = 0; for (len = 1; len < N; len <<= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas_inv[k++]; + zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { t = p[j]; p[j] = t + p[j + len]; p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); } } } for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)f * p[j]); + p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) f * p[j]); } } diff --git a/crypto_sign/dilithium2/clean/ntt.h b/crypto_sign/dilithium2/clean/ntt.h index 5ba0fcac..b02c9dab 100644 --- a/crypto_sign/dilithium2/clean/ntt.h +++ b/crypto_sign/dilithium2/clean/ntt.h @@ -1,8 +1,9 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM2_CLEAN_NTT_H + +#include #include "params.h" -#include void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]); void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]); diff --git a/crypto_sign/dilithium2/clean/packing.c b/crypto_sign/dilithium2/clean/packing.c index 31c5ce9a..d6008917 100644 --- a/crypto_sign/dilithium2/clean/packing.c +++ b/crypto_sign/dilithium2/clean/packing.c @@ -4,17 +4,18 @@ #include "polyvec.h" 
/************************************************* -* Name: pack_pk +* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk * * Description: Bit-pack public key pk = (rho, t1). * -* Arguments: - unsigned char pk[]: output byte array -* - const unsigned char rho[]: byte array containing rho +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], - const polyveck *t1) { +void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -28,17 +29,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], } /************************************************* -* Name: unpack_pk +* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_pk * * Description: Unpack public key pk = (rho, t1). * -* Arguments: - const unsigned char rho[]: output byte array for rho +* Arguments: - const uint8_t rho[]: output byte array for rho * - const polyveck *t1: pointer to output vector t1 -* - unsigned char pk[]: byte array containing bit-packed pk +* - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], - polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) { +void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -52,25 +54,26 @@ void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sk +* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk * * Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - unsigned char sk[]: output byte array -* - const unsigned char rho[]: byte array containing rho -* - const unsigned char key[]: byte array containing key -* - const unsigned char tr[]: byte array containing tr +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 * - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { +void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -104,25 +107,26 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], } /************************************************* -* Name: unpack_sk +* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk * * Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - const unsigned char rho[]: output byte array for rho -* - const unsigned char key[]: output byte array for key -* - const unsigned char tr[]: output byte array for tr +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t key[]: output byte array for key +* - const uint8_t tr[]: output byte array for tr * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 * - const polyveck *r0: pointer to output vector t0 -* - unsigned char sk[]: byte array containing bit-packed sk +* - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]) { +void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -156,19 +160,20 @@ void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sig +* Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig * * Description: Bit-pack signature sig = (z, h, c). 
* -* Arguments: - unsigned char sig[]: output byte array +* Arguments: - uint8_t sig[]: output byte array * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial +* - const poly *c: pointer to PQCLEAN_DILITHIUM2_CLEAN_challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { +void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { unsigned int i, j, k; uint64_t signs, mask; @@ -182,10 +187,11 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (unsigned char) j; + sig[k++] = (uint8_t)j; } } - sig[OMEGA + i] = (unsigned char) k; + + sig[OMEGA + i] = (uint8_t)k; } while (k < OMEGA) { sig[k++] = 0; @@ -199,7 +205,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], sig[i] = 0; for (j = 0; j < 8; ++j) { if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (unsigned char) (1U << j); + sig[i] |= (uint8_t)(1u << j); if (c->coeffs[8 * i + j] == (Q - 1)) { signs |= mask; } @@ -209,27 +215,28 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], } sig += N / 8; for (i = 0; i < 8; ++i) { - sig[i] = (unsigned char) (signs >> 8 * i); + sig[i] = (uint8_t)(signs >> 8u * i); } } /************************************************* -* Name: unpack_sig +* Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig * * Description: Unpack signature sig = (z, h, c). 
* * Arguments: - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial -* - const unsigned char sig[]: byte array containing +* - poly *c: pointer to output PQCLEAN_DILITHIUM2_CLEAN_challenge polynomial +* - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z, - polyveck *h, - poly *c, - const unsigned char sig[CRYPTO_BYTES]) { +int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { unsigned int i, j, k; uint64_t signs; @@ -266,6 +273,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z, return 1; } } + sig += OMEGA + K; /* Decode c */ diff --git a/crypto_sign/dilithium2/clean/packing.h b/crypto_sign/dilithium2/clean/packing.h index 3937b634..7207a66c 100644 --- a/crypto_sign/dilithium2/clean/packing.h +++ b/crypto_sign/dilithium2/clean/packing.h @@ -1,31 +1,36 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H +#define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H #include "params.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const 
uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0); +void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); -void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]); -int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(polyvecl *z, polyveck *h, poly *c, - const unsigned char sig[CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); #endif diff --git a/crypto_sign/dilithium2/clean/params.h b/crypto_sign/dilithium2/clean/params.h index ab615ec7..b7505cbe 100644 --- a/crypto_sign/dilithium2/clean/params.h +++ b/crypto_sign/dilithium2/clean/params.h @@ -1,19 +1,17 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 #define QBITS 23 -#define ROOT_OF_UNITY 1753 #define D 14 #define GAMMA1 ((Q - 1)/16) #define GAMMA2 (GAMMA1/2) #define ALPHA (2*GAMMA2) - -// DilithiumII parameters #define K 4 #define L 3 #define ETA 6 diff --git a/crypto_sign/dilithium2/clean/poly.c b/crypto_sign/dilithium2/clean/poly.c index 9a0d19ff..18c759b1 100644 --- a/crypto_sign/dilithium2/clean/poly.c +++ b/crypto_sign/dilithium2/clean/poly.c @@ -1,10 +1,11 @@ +#include + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "rounding.h" 
#include "symmetric.h" -#include /************************************************* @@ -16,8 +17,7 @@ * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a->coeffs[i]); } } @@ -31,8 +31,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_csubq(a->coeffs[i]); } } @@ -46,8 +45,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_freeze(a->coeffs[i]); } } @@ -61,9 +59,8 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) { * - const poly *a: pointer to first summand * - const poly *b: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } @@ -78,11 +75,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial to be -* subtraced from first input polynomial +* subtracted 
from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; } } @@ -96,8 +92,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] <<= D; } } @@ -139,9 +134,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); } @@ -160,12 +153,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly * * - const poly *v: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -182,12 +172,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a * - const poly *c: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i 
< N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -204,13 +191,11 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) * Returns number of 1 bits. **************************************************/ unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - unsigned int i, s = 0; - - for (i = 0; i < N; ++i) { + unsigned int s = 0; + for (size_t i = 0; i < N; ++i) { h->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); s += h->coeffs[i]; } - return s; } @@ -224,12 +209,9 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, co * - const poly *h: pointer to input hint polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); } - } /************************************************* @@ -244,15 +226,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * * Returns 0 if norm is strictly smaller than B and 1 otherwise. **************************************************/ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { - unsigned int i; int32_t t; - /* It is ok to leak which coefficient violates the bound since the probability for each coefficient is independent of secret data but we must not leak the sign of the centralized representative. 
*/ - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { /* Absolute value of centralized representative */ - t = (int32_t) ((Q - 1) / 2 - a->coeffs[i]); + t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); t ^= (t >> 31); t = (Q - 1) / 2 - t; @@ -260,7 +240,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { return 1; } } - return 0; } @@ -272,7 +251,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -280,8 +259,8 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { **************************************************/ static unsigned int rej_uniform(uint32_t *a, unsigned int len, - const unsigned char *buf, - unsigned int buflen) { + const uint8_t *buf, + size_t buflen) { unsigned int ctr, pos; uint32_t t; @@ -308,19 +287,20 @@ static unsigned int rej_uniform(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES)/STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t seed[SEEDBYTES], uint16_t nonce) { - unsigned int i, ctr, off; - unsigned int buflen = POLY_UNIFORM_BUFLEN; - unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; - shake128ctx state; + unsigned int i, ctr; + size_t buflen = POLY_UNIFORM_BUFLEN; + uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; + stream128_state state; + size_t off; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); @@ -347,7 +327,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -355,7 +335,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, **************************************************/ static unsigned int rej_eta(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -384,19 +364,18 @@ static unsigned int rej_eta(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N/2 * (1U << SETABITS)) / (2*ETA + 1)\ - + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) #define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce) { unsigned int ctr; - unsigned char buf[POLY_UNIFORM_ETA_BUFLEN]; - shake128ctx state; + uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; + stream128_state state; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); @@ -418,7 +397,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. 
Can be smaller than len if not enough @@ -426,7 +405,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, **************************************************/ static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -438,7 +417,7 @@ static unsigned int rej_gamma1m1(uint32_t *a, t0 |= (uint32_t)buf[pos + 2] << 16; t0 &= 0xFFFFF; - t1 = buf[pos + 2] >> 4; + t1 = buf[pos + 2] >> 4; t1 |= (uint32_t)buf[pos + 3] << 4; t1 |= (uint32_t)buf[pos + 4] << 12; @@ -463,19 +442,19 @@ static unsigned int rej_gamma1m1(uint32_t *a, * sampling on output stream of SHAKE256(seed|nonce). * * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * CRHBYTES * - uint16_t nonce: 16-bit nonce **************************************************/ #define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) #define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce) { unsigned int i, ctr, off; unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - shake256ctx state; + uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + stream256_state state; stream256_init(&state, seed, nonce); stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); @@ -500,18 +479,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a, * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. * Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLETA_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { unsigned int i; - unsigned char t[8]; + uint8_t t[8]; for (i = 0; i < N / 2; ++i) { - t[0] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 0]); - t[1] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 1]); - r[i] = (uint8_t) (t[0] | (t[1] << 4)); + t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]); + t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]); + r[i] = (uint8_t)(t[0] | (t[1] << 4)); } } @@ -522,68 +501,67 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { * Output coefficients lie in [Q-ETA,Q+ETA]. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) { +void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; } + } /************************************************* -* Name: polyt1_pack +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack * * Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { unsigned int i; for (i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t) ((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t) ((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t) ((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t) ((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t) ((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t) ((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t) ((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t) ((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t) ((a->coeffs[8 * i + 7] >> 1)); + r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); + r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); + r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); + r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); + r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); + r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); + r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); + r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); + r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); } } /************************************************* -* Name: 
polyt1_unpack +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack * * Description: Unpack polynomial t1 with 9-bit coefficients. * Output coefficients are standard representatives. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; +void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; + r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; + r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; + r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; + r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; + r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; + r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; + r->coeffs[8 * i + 7] 
= ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; } - } /************************************************* @@ -592,32 +570,30 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. * Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT0_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { uint32_t t[4]; - for (i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + for (size_t i = 0; i < N / 4; ++i) { + t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; + t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; + t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; + t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; - r[7 * i + 0] = (uint8_t) (t[0]); - r[7 * i + 1] = (uint8_t) (t[0] >> 8); - r[7 * i + 1] |= (uint8_t) (t[1] << 6); - r[7 * i + 2] = (uint8_t) (t[1] >> 2); - r[7 * i + 3] = (uint8_t) (t[1] >> 10); - r[7 * i + 3] |= (uint8_t) (t[2] << 4); - r[7 * i + 4] = (uint8_t) (t[2] >> 4); - r[7 * i + 5] = (uint8_t) (t[2] >> 12); - r[7 * i + 5] |= (uint8_t) (t[3] << 2); - r[7 * i + 6] = (uint8_t) (t[3] >> 6); + r[7 * i + 0] = (uint8_t)(t[0]); + r[7 * i + 1] = (uint8_t)(t[0] >> 8); + r[7 * i + 1] |= (uint8_t)(t[1] << 6); + r[7 * i + 2] = (uint8_t)(t[1] >> 2); + r[7 * i + 3] = (uint8_t)(t[1] >> 10); + r[7 * i + 3] |= (uint8_t)(t[2] << 4); + r[7 * i + 4] = (uint8_t)(t[2] >> 4); + 
r[7 * i + 5] = (uint8_t)(t[2] >> 12); + r[7 * i + 5] |= (uint8_t)(t[3] << 2); + r[7 * i + 6] = (uint8_t)(t[3] >> 6); } - } /************************************************* @@ -627,32 +603,30 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { * Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { - unsigned int i; +void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { - for (i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; + for (size_t i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[7 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; + r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; + r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; + r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; + r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 
1]; r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; } - } /************************************************* @@ -662,29 +636,27 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { * in [-(GAMMA1 - 1), GAMMA1 - 1]. * Input coefficients are assumed to be standard representatives. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLZ_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a) { uint32_t t[2]; - for (i = 0; i < N / 2; ++i) { + for (size_t i = 0; i < N / 2; ++i) { /* Map to {0,...,2*GAMMA1 - 2} */ t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; t[0] += ((int32_t)t[0] >> 31) & Q; t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; t[1] += ((int32_t)t[1] >> 31) & Q; - r[5 * i + 0] = (uint8_t) (t[0]); - r[5 * i + 1] = (uint8_t) (t[0] >> 8); - r[5 * i + 2] = (uint8_t) (t[0] >> 16); - r[5 * i + 2] |= (uint8_t) (t[1] << 4); - r[5 * i + 3] = (uint8_t) (t[1] >> 4); - r[5 * i + 4] = (uint8_t) (t[1] >> 12); + r[5 * i + 0] = (uint8_t)t[0]; + r[5 * i + 1] = (uint8_t)(t[0] >> 8); + r[5 * i + 2] = (uint8_t)(t[0] >> 16); + r[5 * i + 2] |= (uint8_t)(t[1] << 4); + r[5 * i + 3] = (uint8_t)(t[1] >> 4); + r[5 * i + 4] = (uint8_t)(t[1] >> 12); } - } /************************************************* @@ -695,26 +667,23 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a) { * Output coefficients are standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; + r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; } - } /************************************************* @@ -723,15 +692,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial w1 with coefficients in [0, 15]. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLW1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(unsigned char *r, const poly *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { - r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); +void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + for (size_t i = 0; i < N / 2; ++i) { + r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); } } diff --git a/crypto_sign/dilithium2/clean/poly.h b/crypto_sign/dilithium2/clean/poly.h index 434f8ebb..c33876cc 100644 --- a/crypto_sign/dilithium2/clean/poly.h +++ b/crypto_sign/dilithium2/clean/poly.h @@ -1,8 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM2_CLEAN_POLY_H + +#include #include "params.h" -#include typedef struct { uint32_t coeffs[N]; @@ -27,27 +28,27 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B); void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce); -void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); -void 
PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(unsigned char *r, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a); #endif diff --git a/crypto_sign/dilithium2/clean/polyvec.c b/crypto_sign/dilithium2/clean/polyvec.c index 47c612ac..6a95880c 100644 --- a/crypto_sign/dilithium2/clean/polyvec.c +++ b/crypto_sign/dilithium2/clean/polyvec.c @@ -1,14 +1,15 @@ +#include + #include "params.h" #include "poly.h" #include "polyvec.h" -#include /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ /************************************************* -* Name: polyvecl_freeze +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze * * Description: Reduce coefficients of polynomials in vector of length L * to standard representatives. 
@@ -24,7 +25,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { } /************************************************* -* Name: polyvecl_add +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add * * Description: Add vectors of polynomials of length L. * No modular reduction is performed. @@ -42,7 +43,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const } /************************************************* -* Name: polyvecl_ntt +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt * * Description: Forward NTT of all polynomials in vector of length L. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -58,7 +59,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) { } /************************************************* -* Name: polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials @@ -85,7 +86,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, } /************************************************* -* Name: polyvecl_chknorm +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. * Assumes input coefficients to be standard representatives. @@ -96,14 +97,15 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, * Returns 0 if norm of all polynomials is strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { +int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { unsigned int i; for (i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } @@ -113,7 +115,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) /************************************************* -* Name: polyveck_reduce +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K * to representatives in [0,2*Q[. @@ -129,7 +131,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) { } /************************************************* -* Name: polyveck_csubq +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq * * Description: For all coefficients of polynomials in vector of length K * subtract Q if coefficient is bigger than Q. @@ -145,7 +147,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) { } /************************************************* -* Name: polyveck_freeze +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze * * Description: Reduce coefficients of polynomials in vector of length K * to standard representatives. @@ -161,7 +163,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { } /************************************************* -* Name: polyveck_add +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_add * * Description: Add vectors of polynomials of length K. * No modular reduction is performed. @@ -179,7 +181,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_sub +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub * * Description: Subtract vectors of polynomials of length K. 
* Assumes coefficients of polynomials in second input vector @@ -199,7 +201,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_shiftl +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular * reduction. Assumes input coefficients to be less than 2^{32-D}. @@ -215,7 +217,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { } /************************************************* -* Name: polyveck_ntt +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt * * Description: Forward NTT of all polynomials in vector of length K. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -231,7 +233,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { } /************************************************* -* Name: polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. Input coefficients need to be less @@ -248,7 +250,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) { } /************************************************* -* Name: polyveck_chknorm +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. * Assumes input coefficients to be standard representatives. @@ -259,19 +261,20 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) { * Returns 0 if norm of all polynomials are strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t bound) { +int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { unsigned int i; for (i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } /************************************************* -* Name: polyveck_power2round +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, * compute a0, a1 such that a mod Q = a1*2^D + a0 @@ -293,7 +296,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, c } /************************************************* -* Name: polyveck_decompose +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, * compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 @@ -316,7 +319,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con } /************************************************* -* Name: polyveck_make_hint +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint * * Description: Compute hint vector. * @@ -339,19 +342,19 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, } /************************************************* -* Name: polyveck_use_hint +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint * * Description: Use hint vector to correct the high bits of input vector. 
* * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *u: pointer to input vector +* - const polyveck *v: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { unsigned int i; for (i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); } } diff --git a/crypto_sign/dilithium2/clean/polyvec.h b/crypto_sign/dilithium2/clean/polyvec.h index 814043fc..4662e70a 100644 --- a/crypto_sign/dilithium2/clean/polyvec.h +++ b/crypto_sign/dilithium2/clean/polyvec.h @@ -1,9 +1,10 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H + +#include #include "params.h" #include "poly.h" -#include /* Vectors of polynomials of length L */ typedef struct { @@ -46,6 +47,6 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, const polyveck *v0, const polyveck *v1); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); #endif diff --git a/crypto_sign/dilithium2/clean/reduce.c b/crypto_sign/dilithium2/clean/reduce.c index 8b07c59e..8444de2d 100644 --- a/crypto_sign/dilithium2/clean/reduce.c +++ b/crypto_sign/dilithium2/clean/reduce.c @@ -1,9 +1,10 @@ -#include "params.h" -#include "reduce.h" #include +#include "params.h" +#include "reduce.h" + /************************************************* -* Name: montgomery_reduce 
+* Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce * * Description: For finite field element a with 0 <= a <= Q*2^32, * compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. @@ -20,11 +21,11 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a) { t *= Q; t = a + t; t >>= 32; - return (uint32_t) t; + return (uint32_t)t; } /************************************************* -* Name: reduce32 +* Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32 * * Description: For finite field element a, compute r \equiv a (mod Q) * such that 0 <= r < 2*Q. @@ -43,7 +44,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a) { } /************************************************* -* Name: csubq +* Name: PQCLEAN_DILITHIUM2_CLEAN_csubq * * Description: Subtract Q if input coefficient is bigger than Q. * @@ -58,7 +59,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) { } /************************************************* -* Name: freeze +* Name: PQCLEAN_DILITHIUM2_CLEAN_freeze * * Description: For finite field element a, compute standard * representative r = a mod Q. diff --git a/crypto_sign/dilithium2/clean/reduce.h b/crypto_sign/dilithium2/clean/reduce.h index 91ba25b2..fbd4b573 100644 --- a/crypto_sign/dilithium2/clean/reduce.h +++ b/crypto_sign/dilithium2/clean/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H #include diff --git a/crypto_sign/dilithium2/clean/rounding.c b/crypto_sign/dilithium2/clean/rounding.c index 126719e2..6272c927 100644 --- a/crypto_sign/dilithium2/clean/rounding.c +++ b/crypto_sign/dilithium2/clean/rounding.c @@ -1,7 +1,10 @@ +#include + #include "params.h" #include "rounding.h" + /************************************************* -* Name: power2round +* Name: PQCLEAN_DILITHIUM2_CLEAN_power2round * * Description: For finite field element a, compute a0, a1 such that * a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. 
@@ -17,16 +20,16 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) { /* Centralized remainder mod 2^D */ t = a & ((1U << D) - 1); - t -= ((1U << (D - 1)) + 1); - t += ((uint32_t)((int32_t)t >> 31) & (1U << D)); - t -= ((1U << (D - 1)) - 1); - *a0 = (Q + t); + t -= (1U << (D - 1)) + 1; + t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); + t -= (1U << (D - 1)) - 1; + *a0 = Q + t; a = (a - t) >> D; return a; } /************************************************* -* Name: decompose +* Name: PQCLEAN_DILITHIUM2_CLEAN_decompose * * Description: For finite field element a, compute high and low bits a0, a1 such * that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except @@ -41,28 +44,29 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) { **************************************************/ uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) { int32_t t, u; + /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (int32_t) ((a >> 19) << 9); + t = a & 0x7FFFFu; + t += (int32_t)((a >> 19u) << 9u); t -= ALPHA / 2 + 1; t += (t >> 31) & ALPHA; t -= ALPHA / 2 - 1; - a -= (uint32_t) t; + a -= (uint32_t)t; /* Divide by ALPHA (possible to avoid) */ - u = (int32_t) a - 1; + u = (int32_t)(a - 1); u >>= 31; a = (a >> 19) + 1; a -= u & 1; /* Border case */ - *a0 = Q + (uint32_t)t - (a >> 4); - a &= 0xF; + *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); + a &= 0xFu; return a; } /************************************************* -* Name: make_hint +* Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint * * Description: Compute hint bit indicating whether the low bits of the * input element overflow into the high bits. Inputs assumed to be @@ -73,7 +77,7 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) { * * Returns 1 if high bits of a and b differ and 0 otherwise. 
**************************************************/ -unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { +unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(const uint32_t a0, const uint32_t a1) { if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { return 0; } @@ -82,7 +86,7 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { } /************************************************* -* Name: use_hint +* Name: PQCLEAN_DILITHIUM2_CLEAN_use_hint * * Description: Correct high bits according to hint. * @@ -91,7 +95,7 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { * * Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) { +uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(const uint32_t a, const unsigned int hint) { uint32_t a0, a1; a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(a, &a0); @@ -101,5 +105,15 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) { if (a0 > Q) { return (a1 + 1) & 0xF; } + return (a1 - 1) & 0xF; + + /* If PQCLEAN_DILITHIUM2_CLEAN_decompose does not divide out ALPHA: + if(hint == 0) + return a1; + else if(a0 > Q) + return (a1 + ALPHA) % (Q - 1); + else + return (a1 - ALPHA) % (Q - 1); + */ } diff --git a/crypto_sign/dilithium2/clean/rounding.h b/crypto_sign/dilithium2/clean/rounding.h index 94773a3b..5010ce35 100644 --- a/crypto_sign/dilithium2/clean/rounding.h +++ b/crypto_sign/dilithium2/clean/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H #include diff --git a/crypto_sign/dilithium2/clean/sign.c b/crypto_sign/dilithium2/clean/sign.c index b6b2e28f..4be7f2bb 100644 --- a/crypto_sign/dilithium2/clean/sign.c +++ b/crypto_sign/dilithium2/clean/sign.c @@ -1,3 +1,6 @@ +#include +#include + #include "fips202.h" #include "packing.h" 
#include "params.h" @@ -7,19 +10,17 @@ #include "sign.h" #include "symmetric.h" -#include - /************************************************* -* Name: expand_mat +* Name: PQCLEAN_DILITHIUM2_CLEAN_expand_mat * * Description: Implementation of ExpandA. Generates matrix A with uniformly * random coefficients a_{i,j} by performing rejection * sampling on the output stream of SHAKE128(rho|i|j). * * Arguments: - polyvecl mat[K]: output matrix -* - const unsigned char rho[]: byte array containing seed rho +* - const uint8_t rho[]: byte array containing seed rho **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) { +void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { unsigned int i, j; for (i = 0; i < K; ++i) { @@ -30,23 +31,23 @@ void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rh } /************************************************* -* Name: challenge +* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge * * Description: Implementation of H. Samples polynomial with 60 nonzero * coefficients in {-1,1} using the output stream of * SHAKE256(mu|w1). 
* * Arguments: - poly *c: pointer to output polynomial -* - const unsigned char mu[]: byte array containing mu +* - const uint8_t mu[]: byte array containing mu * - const polyveck *w1: pointer to vector w1 **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, - const unsigned char mu[CRHBYTES], + const uint8_t mu[CRHBYTES], const polyveck *w1) { unsigned int i, b, pos; uint64_t signs; - unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - unsigned char outbuf[SHAKE256_RATE]; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; shake256ctx state; for (i = 0; i < CRHBYTES; ++i) { @@ -88,22 +89,22 @@ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, } /************************************************* -* Name: crypto_sign_keypair +* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair * * Description: Generates public and private key. * -* Arguments: - unsigned char *pk: pointer to output public key (allocated -* array of CRYPTO_PUBLICKEYBYTES bytes) -* - unsigned char *sk: pointer to output private key (allocated -* array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { unsigned int i; - unsigned char seedbuf[3 * SEEDBYTES]; - unsigned char tr[CRHBYTES]; - const unsigned char *rho, *rhoprime, *key; + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; uint16_t nonce = 0; polyvecl mat[K]; polyvecl s1, s1hat; @@ -144,19 +145,35 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1); /* 
Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, CRYPTO_PUBLICKEYBYTES); + crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES +* of len) +* - size_t *smlen: pointer to output length of signed message +* (should be PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk) { + const uint8_t *msg, size_t mlen, + const uint8_t *sk) { unsigned long long i; unsigned int n; - unsigned char seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; - unsigned char *rho, *tr, *key, *mu, *rhoprime; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; uint16_t nonce = 0; poly c, chat; polyvecl mat[K], s1, y, yhat, z; @@ -170,13 +187,12 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( rhoprime = mu + CRHBYTES; PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); - // use incremental hash API instead of copying around buffers /* Compute CRH(tr, msg) */ shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); + shake256_inc_absorb(&state, msg, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); @@ -253,11 +269,51 @@ rej: /* Write signature */ PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, &z, &h, &c); - - *siglen = CRYPTO_BYTES; + *siglen = 
PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - unsigned long long *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - unsigned long long mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, m, mlen); + rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); + *smlen += mlen; + return rc; +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *sig: signature +* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) +* - uint8_t *m: pointer to message +* - size_t *mlen: pointer to output length of message +* - uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { @@ -268,7 +324,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( polyvecl mat[K], z; polyveck t1, w1, h, tmp1, tmp2; - if (siglen < CRYPTO_BYTES) { + if (siglen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { return -1; } @@ -281,7 +337,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( } /* Compute CRH(CRH(rho, t1), msg) */ - crh(mu, pk, CRYPTO_PUBLICKEYBYTES); + crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); shake256incctx state; shake256_inc_init(&state); @@ -325,40 +381,9 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( // All good return 0; } -/************************************************* -* Name: crypto_sign -* -* Description: Compute signed message. 
-* -* Arguments: - unsigned char *sm: pointer to output signed message (allocated -* array with CRYPTO_BYTES + mlen bytes), -* can be equal to m -* - unsigned long long *smlen: pointer to output length of signed -* message -* - const unsigned char *m: pointer to message to be signed -* - unsigned long long mlen: length of message -* - const unsigned char *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, - size_t *smlen, - const uint8_t *m, - size_t mlen, - const uint8_t *sk) { - size_t i; - int rc; - for (i = 0; i < mlen; i++) { - sm[CRYPTO_BYTES + i] = m[i]; - } - rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; - -} /************************************************* -* Name: crypto_sign_open +* Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open * * Description: Verify signed message. * @@ -371,24 +396,23 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, - size_t *mlen, - const uint8_t *sm, - size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; + *mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, CRYPTO_BYTES, - sm + CRYPTO_BYTES, *mlen, pk)) { + if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] 
= sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium2/clean/sign.h b/crypto_sign/dilithium2/clean/sign.h index 0d909396..4196d29b 100644 --- a/crypto_sign/dilithium2/clean/sign.h +++ b/crypto_sign/dilithium2/clean/sign.h @@ -1,30 +1,12 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H +#include "api.h" #include "params.h" #include "poly.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const unsigned char mu[CRHBYTES], +void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], const polyveck *w1); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); - #endif diff --git a/crypto_sign/dilithium2/clean/stream.c b/crypto_sign/dilithium2/clean/stream.c new file mode 100644 index 00000000..e862e9de --- /dev/null +++ b/crypto_sign/dilithium2/clean/stream.c @@ -0,0 +1,26 @@ +#include "stream.h" + +#include + +void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + + uint8_t buf[SEEDBYTES + 2]; + memcpy(buf, seed, SEEDBYTES); + buf[SEEDBYTES] = (uint8_t)nonce; + buf[SEEDBYTES + 1] = 
(uint8_t)(nonce >> 8); + + shake128_absorb(state, buf, SEEDBYTES + 2); +} + + +void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + + uint8_t buf[CRHBYTES + 2]; + memcpy(buf, seed, CRHBYTES); + buf[CRHBYTES] = (uint8_t)nonce; + buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); + + shake256_absorb(state, buf, CRHBYTES + 2); +} diff --git a/crypto_sign/dilithium2/clean/stream.h b/crypto_sign/dilithium2/clean/stream.h new file mode 100644 index 00000000..d607ce99 --- /dev/null +++ b/crypto_sign/dilithium2/clean/stream.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_DILITHIUM2_CLEAN_STREAM_H +#define PQCLEAN_DILITHIUM2_CLEAN_STREAM_H + +#include + +#include "fips202.h" +#include "params.h" + +void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#endif diff --git a/crypto_sign/dilithium2/clean/symmetric.c b/crypto_sign/dilithium2/clean/symmetric.c deleted file mode 100644 index 3618b057..00000000 --- a/crypto_sign/dilithium2/clean/symmetric.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "symmetric.h" -#include "fips202.h" - -void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char seed[SEEDBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[SEEDBYTES + 2]; - - for (i = 0; i < SEEDBYTES; ++i) { - buf[i] = seed[i]; - } - buf[SEEDBYTES] = (uint8_t) nonce; - buf[SEEDBYTES + 1] = (uint8_t) (nonce >> 8); - - shake128_absorb(state, buf, sizeof(buf)); -} - -void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char seed[CRHBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[CRHBYTES + 2]; - - for (i = 0; i < CRHBYTES; ++i) { - buf[i] = seed[i]; - } - buf[CRHBYTES] = (uint8_t) nonce; - buf[CRHBYTES + 1] = (uint8_t) (nonce >> 8); - - 
shake256_absorb(state, buf, sizeof(buf)); -} diff --git a/crypto_sign/dilithium2/clean/symmetric.h b/crypto_sign/dilithium2/clean/symmetric.h index c26cc354..b36694cc 100644 --- a/crypto_sign/dilithium2/clean/symmetric.h +++ b/crypto_sign/dilithium2/clean/symmetric.h @@ -1,8 +1,11 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H + +#include "params.h" +#include "stream.h" + #include "fips202.h" -#include "params.h" #define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) #define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(STATE, SEED, NONCE) @@ -13,11 +16,8 @@ #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char *seed, - uint16_t nonce); +typedef shake128ctx stream128_state; +typedef shake256ctx stream256_state; + #endif diff --git a/crypto_sign/dilithium3/META.yml b/crypto_sign/dilithium3/META.yml index 58f92d67..4e8548a4 100644 --- a/crypto_sign/dilithium3/META.yml +++ b/crypto_sign/dilithium3/META.yml @@ -17,4 +17,13 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/dilithium/commit/40f79645879b5c69835cd91d06945d7c24f39922 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + required_flags: + - avx2 + - bmi2 diff --git a/crypto_sign/dilithium3/avx2/LICENSE b/crypto_sign/dilithium3/avx2/LICENSE new file mode 100644 index 00000000..40541676 --- /dev/null +++ 
b/crypto_sign/dilithium3/avx2/LICENSE @@ -0,0 +1,6 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and the random number generator +we are using public-domain code from sources +and by authors listed in comments on top of +the respective files. diff --git a/crypto_sign/dilithium3/avx2/Makefile b/crypto_sign/dilithium3/avx2/Makefile new file mode 100644 index 00000000..3438ba44 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/Makefile @@ -0,0 +1,43 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium3_avx2.a + +SOURCES = fips202x4.c invntt.s nttconsts.c ntt.s packing.c pointwise.S poly.c \ + polyvec.c reduce.s rejsample.c rounding.c sign.c stream.c +OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ + polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o +HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ + nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ + fips202x4.h shuffle.inc + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -Wcast-align \ + -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +KECCAK4XDIR=../../../common/keccak4x +KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o +KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.S $(HEADERS) + $(AS) -c -o $@ $< + +$(LIB): $(OBJECTS) $(KECCAK4X) + $(AR) -r $@ $^ + +$(KECCAK4X): + $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) + $(MAKE) -C $(KECCAK4XDIR) clean + diff --git a/crypto_sign/dilithium3/avx2/alignment.h b/crypto_sign/dilithium3/avx2/alignment.h new file mode 100644 index 00000000..a1eb88f8 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/alignment.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H +#define PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H + 
+#define ALIGNED_UINT8(N) \ + union { \ + uint32_t as_arr[N]; \ + __m256i as_vec[(N)/32]; \ + } + +#define ALIGNED_UINT32(N) \ + union { \ + uint32_t as_arr[N]; \ + __m256i as_vec[(N)/8]; \ + } + +#define ALIGNED_UINT64(N) \ + union { \ + uint64_t as_arr[N]; \ + __m256i as_vec[(N)/8]; \ + } + +#endif //PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H diff --git a/crypto_sign/dilithium3/avx2/api.h b/crypto_sign/dilithium3/avx2/api.h new file mode 100644 index 00000000..b5e5cbf7 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/api.h @@ -0,0 +1,37 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_API_H +#define PQCLEAN_DILITHIUM3_AVX2_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1472U +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 3504U +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 2701U + +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3" + + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair( + uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *msg, size_t len, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + + + +#endif diff --git a/crypto_sign/dilithium3/avx2/fips202x4.c b/crypto_sign/dilithium3/avx2/fips202x4.c new file mode 100644 index 00000000..d3bc9a2a --- /dev/null +++ b/crypto_sign/dilithium3/avx2/fips202x4.c @@ -0,0 +1,239 @@ +#include +#include + +#include "fips202.h" +#include "fips202x4.h" +#include "params.h" + +#define NROUNDS 24 +#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) + +static uint64_t load64(const uint8_t *x) { + unsigned int i; + uint64_t r = 
0; + + for (i = 0; i < 8; ++i) { + r |= (uint64_t)x[i] << 8 * i; + } + + return r; +} + +static void store64(uint8_t *x, uint64_t u) { + unsigned int i; + + for (i = 0; i < 8; ++i) { + x[i] = (uint8_t)(u >> 8 * i); + } +} + +/* Use implementation from the Keccak Code Package */ +extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); +#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds + +static void keccak_absorb4x(__m256i *s, + unsigned int r, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen, + uint8_t p) { + unsigned long long i; + uint8_t t0[200]; + uint8_t t1[200]; + uint8_t t2[200]; + uint8_t t3[200]; + uint64_t *ss = (uint64_t *)s; + + for (i = 0; i < 25; ++i) { + s[i] = _mm256_xor_si256(s[i], s[i]); + } + + while (mlen >= r) { + for (i = 0; i < r / 8; ++i) { + ss[4 * i + 0] ^= load64(m0 + 8 * i); + ss[4 * i + 1] ^= load64(m1 + 8 * i); + ss[4 * i + 2] ^= load64(m2 + 8 * i); + ss[4 * i + 3] ^= load64(m3 + 8 * i); + } + + KeccakF1600_StatePermute4x(s); + mlen -= r; + m0 += r; + m1 += r; + m2 += r; + m3 += r; + } + + for (i = 0; i < r; ++i) { + t0[i] = 0; + t1[i] = 0; + t2[i] = 0; + t3[i] = 0; + } + for (i = 0; i < mlen; ++i) { + t0[i] = m0[i]; + t1[i] = m1[i]; + t2[i] = m2[i]; + t3[i] = m3[i]; + } + + t0[i] = p; + t1[i] = p; + t2[i] = p; + t3[i] = p; + + t0[r - 1] |= 128; + t1[r - 1] |= 128; + t2[r - 1] |= 128; + t3[r - 1] |= 128; + + for (i = 0; i < r / 8; ++i) { + ss[4 * i + 0] ^= load64(t0 + 8 * i); + ss[4 * i + 1] ^= load64(t1 + 8 * i); + ss[4 * i + 2] ^= load64(t2 + 8 * i); + ss[4 * i + 3] ^= load64(t3 + 8 * i); + } +} + + +static void keccak_squeezeblocks4x(uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + unsigned int r, + __m256i *s) { + unsigned int i; + uint64_t *ss = (uint64_t *)s; + + while (nblocks > 0) { + KeccakF1600_StatePermute4x(s); + for (i = 0; i < r / 8; ++i) { + store64(h0 + 8 * i, ss[4 * i + 0]); + store64(h1 + 8 * 
i, ss[4 * i + 1]); + store64(h2 + 8 * i, ss[4 * i + 2]); + store64(h3 + 8 * i, ss[4 * i + 3]); + } + + h0 += r; + h1 += r; + h2 += r; + h3 += r; + --nblocks; + } + +} + +void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); +} + +void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s) { + keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); +} + +void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); +} + +void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s) { + keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); +} + +void PQCLEAN_DILITHIUM3_AVX2_shake128_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + unsigned int i; + unsigned long nblocks = hlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + __m256i s[25]; + + PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); + PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + + h0 += nblocks * SHAKE128_RATE; + h1 += nblocks * SHAKE128_RATE; + h2 += nblocks * SHAKE128_RATE; + h3 += nblocks * SHAKE128_RATE; + hlen -= nblocks * SHAKE128_RATE; + + if (hlen) { + PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); + for (i = 0; i < hlen; ++i) { + h0[i] = t[0][i]; + h1[i] = t[1][i]; + h2[i] = t[2][i]; + h3[i] = 
t[3][i]; + } + } +} + +void PQCLEAN_DILITHIUM3_AVX2_shake256_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + unsigned int i; + unsigned long nblocks = hlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + __m256i s[25]; + + PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); + PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + + h0 += nblocks * SHAKE256_RATE; + h1 += nblocks * SHAKE256_RATE; + h2 += nblocks * SHAKE256_RATE; + h3 += nblocks * SHAKE256_RATE; + hlen -= nblocks * SHAKE256_RATE; + + if (hlen) { + PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); + for (i = 0; i < hlen; ++i) { + h0[i] = t[0][i]; + h1[i] = t[1][i]; + h2[i] = t[2][i]; + h3[i] = t[3][i]; + } + } +} diff --git a/crypto_sign/dilithium3/avx2/fips202x4.h b/crypto_sign/dilithium3/avx2/fips202x4.h new file mode 100644 index 00000000..58d25641 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/fips202x4.h @@ -0,0 +1,65 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H +#define PQCLEAN_DILITHIUM3_AVX2_FIPS202X4_H + +#include +#include + +#include "params.h" + +void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM3_AVX2_shake128_4x( + uint8_t *h0, 
+ uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM3_AVX2_shake256_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +#endif diff --git a/crypto_sign/dilithium3/avx2/invntt.s b/crypto_sign/dilithium3/avx2/invntt.s new file mode 100644 index 00000000..42fe13d6 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/invntt.s @@ -0,0 +1,281 @@ +.include "shuffle.inc" + +.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3 +vpaddd %ymm2,%ymm\l0,%ymm12 +vpaddd %ymm2,%ymm\l1,%ymm13 +vpaddd %ymm2,%ymm\l2,%ymm14 + +vpsubd %ymm\h0,%ymm12,%ymm12 +vpsubd %ymm\h1,%ymm13,%ymm13 +vpsubd %ymm\h2,%ymm14,%ymm14 + +vpmuludq %ymm\z0,%ymm12,%ymm12 +vpmuludq %ymm\z0,%ymm13,%ymm13 +vpaddd %ymm2,%ymm\l3,%ymm15 + +vpmuludq %ymm\z1,%ymm14,%ymm14 +vpsubd %ymm\h3,%ymm15,%ymm15 +vpaddd %ymm\l0,%ymm\h0,%ymm\l0 + +vpmuludq %ymm\z1,%ymm15,%ymm15 +vpaddd %ymm\l1,%ymm\h1,%ymm\l1 +vpaddd %ymm\l2,%ymm\h2,%ymm\l2 + +vpaddd %ymm\l3,%ymm\h3,%ymm\l3 + +vpmuludq %ymm0,%ymm12,%ymm\h0 +vpmuludq %ymm0,%ymm13,%ymm\h1 +vpmuludq %ymm0,%ymm14,%ymm\h2 +vpmuludq %ymm0,%ymm15,%ymm\h3 +vpmuludq %ymm1,%ymm\h0,%ymm\h0 +vpmuludq %ymm1,%ymm\h1,%ymm\h1 +vpmuludq %ymm1,%ymm\h2,%ymm\h2 +vpmuludq %ymm1,%ymm\h3,%ymm\h3 +vpaddq %ymm12,%ymm\h0,%ymm\h0 +vpaddq %ymm13,%ymm\h1,%ymm\h1 +vpaddq %ymm14,%ymm\h2,%ymm\h2 +vpaddq %ymm15,%ymm\h3,%ymm\h3 +vpsrlq $32,%ymm\h0,%ymm\h0 +vpsrlq $32,%ymm\h1,%ymm\h1 +vpsrlq $32,%ymm\h2,%ymm\h2 +vpsrlq $32,%ymm\h3,%ymm\h3 +.endm + +.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx +PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 + 
+#load +vmovdqa (%rsi),%ymm6 +vmovdqa 32(%rsi),%ymm7 +vmovdqa 64(%rsi),%ymm5 +vmovdqa 96(%rsi),%ymm10 + +#reorder +shuffle8 6,5,8,5 +shuffle8 7,10,6,10 + +shuffle4 8,6,4,6 +shuffle4 5,10,8,10 + +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq $32,%ymm10,%ymm11 + +level0: +vpmovzxdq (%rdx),%ymm3 +vpmovzxdq 16(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpmovzxdq 32(%rdx),%ymm5 +vpmovzxdq 48(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level1: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpmovzxdq 64(%rdx),%ymm15 +vpmovzxdq 80(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level2: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpmovzxdq 96(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#shuffle +shuffle4 4,5,3,5 +shuffle4 6,7,4,7 +shuffle4 8,9,6,9 +shuffle4 10,11,8,11 + +level3: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 112(%rdx),%ymm14 +vpbroadcastd 116(%rdx),%ymm15 +vpblendd $0xF0,%ymm15,%ymm14,%ymm10 + +butterfly 3,4,6,8,5,7,9,11 10,10 + +#shuffle +shuffle8 3,4,10,4 +shuffle8 6,8,3,8 +shuffle8 5,7,6,7 +shuffle8 9,11,5,11 + +level4: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 120(%rdx),%ymm9 + +butterfly 10,3,6,5,4,8,7,11 9,9 + 
+#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm4,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx +PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x256q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 256(%rsi),%ymm5 +vmovdqa 512(%rsi),%ymm6 +vmovdqa 768(%rsi),%ymm7 +vmovdqa 1024(%rsi),%ymm8 +vmovdqa 1280(%rsi),%ymm9 +vmovdqa 1536(%rsi),%ymm10 +vmovdqa 1792(%rsi),%ymm11 + +level5: +vpbroadcastd (%rdx),%ymm3 +vpbroadcastd 4(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpbroadcastd 8(%rdx),%ymm5 +vpbroadcastd 12(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level6: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 16(%rdx),%ymm15 +vpbroadcastd 20(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level7: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 24(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#consts +vmovdqa 
_PQCLEAN_DILITHIUM3_AVX2_8xdiv(%rip),%ymm3 + +vpmuludq %ymm3,%ymm4,%ymm4 +vpmuludq %ymm3,%ymm5,%ymm5 +vpmuludq %ymm3,%ymm6,%ymm6 +vpmuludq %ymm3,%ymm7,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm12,%ymm4,%ymm4 +vpaddq %ymm13,%ymm5,%ymm5 +vpaddq %ymm14,%ymm6,%ymm6 +vpaddq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm6,%ymm6 +vpsrlq $32,%ymm7,%ymm7 + +#store +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_mask(%rip),%ymm3 +vpermd %ymm4,%ymm3,%ymm4 +vpermd %ymm5,%ymm3,%ymm5 +vpermd %ymm6,%ymm3,%ymm6 +vpermd %ymm7,%ymm3,%ymm7 +vpermd %ymm8,%ymm3,%ymm8 +vpermd %ymm9,%ymm3,%ymm9 +vpermd %ymm10,%ymm3,%ymm10 +vpermd %ymm11,%ymm3,%ymm11 +vmovdqa %xmm4,(%rdi) +vmovdqa %xmm5,128(%rdi) +vmovdqa %xmm6,256(%rdi) +vmovdqa %xmm7,384(%rdi) +vmovdqa %xmm8,512(%rdi) +vmovdqa %xmm9,640(%rdi) +vmovdqa %xmm10,768(%rdi) +vmovdqa %xmm11,896(%rdi) + +ret diff --git a/crypto_sign/dilithium3/avx2/ntt.h b/crypto_sign/dilithium3/avx2/ntt.h new file mode 100644 index 00000000..a5474dc6 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/ntt.h @@ -0,0 +1,26 @@ +#ifndef NTT_H +#define NTT_H + +#include + +#include "nttconsts.h" +#include "params.h" + +void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas); +void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas); + +void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv); + +void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t 
*a, const uint32_t *b); + +#endif diff --git a/crypto_sign/dilithium3/avx2/ntt.s b/crypto_sign/dilithium3/avx2/ntt.s new file mode 100644 index 00000000..9fb961c6 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/ntt.s @@ -0,0 +1,178 @@ +.include "shuffle.inc" + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3 +#mul +vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 +vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 +vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 +vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 + +#reduce +vpmuludq %ymm0,%ymm\rh0,%ymm12 +vpmuludq %ymm0,%ymm\rh1,%ymm13 +vpmuludq %ymm0,%ymm\rh2,%ymm14 +vpmuludq %ymm0,%ymm\rh3,%ymm15 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm\rh0,%ymm12,%ymm12 +vpaddq %ymm\rh1,%ymm13,%ymm13 +vpaddq %ymm\rh2,%ymm14,%ymm14 +vpaddq %ymm\rh3,%ymm15,%ymm15 +vpsrlq $32,%ymm12,%ymm12 +vpsrlq $32,%ymm13,%ymm13 +vpsrlq $32,%ymm14,%ymm14 +vpsrlq $32,%ymm15,%ymm15 + +#update +vpaddd %ymm2,%ymm\rl0,%ymm\rh0 +vpaddd %ymm2,%ymm\rl1,%ymm\rh1 +vpaddd %ymm2,%ymm\rl2,%ymm\rh2 +vpaddd %ymm2,%ymm\rl3,%ymm\rh3 +vpaddd %ymm12,%ymm\rl0,%ymm\rl0 +vpaddd %ymm13,%ymm\rl1,%ymm\rl1 +vpaddd %ymm14,%ymm\rl2,%ymm\rl2 +vpaddd %ymm15,%ymm\rl3,%ymm\rl3 +vpsubd %ymm12,%ymm\rh0,%ymm\rh0 +vpsubd %ymm13,%ymm\rh1,%ymm\rh1 +vpsubd %ymm14,%ymm\rh2,%ymm\rh2 +vpsubd %ymm15,%ymm\rh3,%ymm\rh3 +.endm + +.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx +PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 + +level0: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd (%rdx),%ymm3 + +#load +vpmovzxdq (%rsi),%ymm4 +vpmovzxdq 128(%rsi),%ymm5 +vpmovzxdq 256(%rsi),%ymm6 +vpmovzxdq 384(%rsi),%ymm7 +vpmovzxdq 512(%rsi),%ymm8 +vpmovzxdq 640(%rsi),%ymm9 +vpmovzxdq 768(%rsi),%ymm10 +vpmovzxdq 896(%rsi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +level1: 
+#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 + +butterfly 4,5,8,9,6,7,10,11 12,12,13,13 + +level2: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 12(%rdx),%ymm12 +vpbroadcastd 16(%rdx),%ymm13 +vpbroadcastd 20(%rdx),%ymm14 +vpbroadcastd 24(%rdx),%ymm15 + +butterfly 4,6,8,10,5,7,9,11 12,13,14,15 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,256(%rdi) +vmovdqa %ymm6,512(%rdi) +vmovdqa %ymm7,768(%rdi) +vmovdqa %ymm8,1024(%rdi) +vmovdqa %ymm9,1280(%rdi) +vmovdqa %ymm10,1536(%rdi) +vmovdqa %ymm11,1792(%rdi) + +ret + +.global PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx +PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x2q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 32(%rsi),%ymm5 +vmovdqa 64(%rsi),%ymm6 +vmovdqa 96(%rsi),%ymm7 +vmovdqa 128(%rsi),%ymm8 +vmovdqa 160(%rsi),%ymm9 +vmovdqa 192(%rsi),%ymm10 +vmovdqa 224(%rsi),%ymm11 + +level3: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd (%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 + +level4: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 +vpblendd $0xF0,%ymm13,%ymm12,%ymm12 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly 3,8,4,9,5,10,6,11 12,12,12,12 + +level5: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpmovzxdq 12(%rdx),%ymm12 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly 7,5,3,10,8,6,4,11 12,12,12,12 + +level6: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpmovzxdq 28(%rdx),%ymm12 +vpmovzxdq 44(%rdx),%ymm13 + +butterfly 7,5,8,6,3,10,4,11 12,12,13,13 + +level7: +#PQCLEAN_DILITHIUM3_AVX2_zetas +vpmovzxdq 60(%rdx),%ymm12 +vpmovzxdq 76(%rdx),%ymm13 +vpmovzxdq 92(%rdx),%ymm14 +vpmovzxdq 108(%rdx),%ymm15 + +butterfly 7,3,8,4,5,10,6,11 12,13,14,15 + +#store +vpsllq $32,%ymm5,%ymm5 +vpsllq $32,%ymm10,%ymm10 +vpsllq $32,%ymm6,%ymm6 
+vpsllq $32,%ymm11,%ymm11 +vpblendd $0xAA,%ymm5,%ymm7,%ymm7 +vpblendd $0xAA,%ymm10,%ymm3,%ymm3 +vpblendd $0xAA,%ymm6,%ymm8,%ymm8 +vpblendd $0xAA,%ymm11,%ymm4,%ymm4 + +shuffle4 7,3,5,3 +shuffle4 8,4,7,4 + +shuffle8 5,7,6,7 +shuffle8 3,4,5,4 + +vmovdqa %ymm6,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm4,96(%rdi) + +ret diff --git a/crypto_sign/dilithium3/avx2/nttconsts.c b/crypto_sign/dilithium3/avx2/nttconsts.c new file mode 100644 index 00000000..12d9ceac --- /dev/null +++ b/crypto_sign/dilithium3/avx2/nttconsts.c @@ -0,0 +1,80 @@ +#include "nttconsts.h" + +#define QINV 4236238847 // -q^(-1) mod 2^32 +#define MONT 4193792ULL +#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) + + +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; + +#undef QINV +#undef MONT +#undef DIV + + +const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas = { + .as_arr = { + 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, + 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, + 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 
5933984, 7144689, 7183191, + 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, + 7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, + 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, + 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, + 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, + 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, + 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, + 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, + 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, + 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, + 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, + 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, + 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, + 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, + 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, + 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, + 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, + 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, + 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, 
+ 4834730, 7018208, 1976782 + } +}; + +const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv = { + .as_arr = { + 6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, + 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, + 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, + 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, + 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, + 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, + 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, + 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, + 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, + 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, + 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, + 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, + 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, + 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, + 5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, + 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, + 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, + 1000202, 6441103, 3632928, 3190144, 4083598, 
1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, + 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326, + 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, + 3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, + 518909, 2608894, 3975713 + } +}; diff --git a/crypto_sign/dilithium3/avx2/nttconsts.h b/crypto_sign/dilithium3/avx2/nttconsts.h new file mode 100644 index 00000000..ed8df189 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/nttconsts.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H +#define PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H + +#include +#include + +#include "alignment.h" +#include "params.h" + +typedef ALIGNED_UINT32(8) aligned_uint32x8_t; + +typedef ALIGNED_UINT32(N) aligned_uint32xN_t; + + +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xqinv; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xq; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x2q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x256q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_mask; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8x23ones; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM3_AVX2_8xdiv; + +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas; +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv; + +#endif //PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H + diff --git a/crypto_sign/dilithium3/avx2/packing.c b/crypto_sign/dilithium3/avx2/packing.c new file mode 100644 index 00000000..6b10b15d --- /dev/null +++ b/crypto_sign/dilithium3/avx2/packing.c @@ -0,0 +1,305 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { /* rho fills the first SEEDBYTES bytes of pk */ + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { /* then K packed t1 polynomials, spaced POLT1_SIZE_PACKED bytes apart */ + PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). rho is read from the first +* SEEDBYTES bytes of pk and the K polynomials of t1 from the +* bytes that follow, i.e. the same layout that +* PQCLEAN_DILITHIUM3_AVX2_pack_pk writes. +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; /* skip rho; t1 starts here */ + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +* - const polyveck *t0: pointer to vector t0 +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { + unsigned int i; /* fields are written back to back: rho, key, tr, s1, s2, t0 */ + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { /* s1: L polynomials, POLETA_SIZE_PACKED bytes apart */ + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { /* s2: K polynomials, same per-polynomial size as s1 */ + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { /* t0: K polynomials, POLT0_SIZE_PACKED bytes apart */ + PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* Fields are read in the same order and at the same offsets +* that PQCLEAN_DILITHIUM3_AVX2_pack_sk writes them.
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t key[]: output byte array for key +* - uint8_t tr[]: output byte array for tr +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - polyveck *t0: pointer to output vector t0 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (z, h, c).
+* +* Arguments: - uint8_t sig[]: output byte array +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +* - const poly *c: pointer to PQCLEAN_DILITHIUM3_AVX2_challenge polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { + unsigned int i, j, k; + uint64_t signs, mask; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + } + sig += L * POLZ_SIZE_PACKED; + + /* Encode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t)j; /* position of a nonzero hint coefficient */ + } + } + + sig[OMEGA + i] = (uint8_t)k; /* cumulative number of positions stored after polynomial i */ + } + while (k < OMEGA) { + sig[k++] = 0; /* zero-fill the unused position slots up to OMEGA */ + } + sig += OMEGA + K; + + /* Encode c */ + signs = 0; + mask = 1; + for (i = 0; i < N / 8; ++i) { + sig[i] = 0; + for (j = 0; j < 8; ++j) { + if (c->coeffs[8 * i + j] != 0) { + sig[i] |= (uint8_t)(1u << j); /* bitmap of nonzero coefficients, 8 per byte */ + if (c->coeffs[8 * i + j] == (Q - 1)) { + signs |= mask; /* sign bit set when the coefficient equals Q - 1 */ + } + mask <<= 1; /* one sign bit per nonzero coefficient, in coefficient order */ + } + } + } + sig += N / 8; + for (i = 0; i < 8; ++i) { + sig[i] = (uint8_t)(signs >> 8u * i); /* sign word stored in 8 bytes, least significant byte first */ + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sig +* +* Description: Unpack signature sig = (z, h, c). +* +* Arguments: - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - poly *c: pointer to output PQCLEAN_DILITHIUM3_AVX2_challenge polynomial +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { + unsigned int i, j, k; + uint64_t signs; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + } + sig += L * POLZ_SIZE_PACKED; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { /* cumulative counts must not decrease and must not exceed OMEGA */ + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + sig += OMEGA + K; + + /* Decode c */ + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)sig[N / 8 + i] << 8 * i; /* sign word follows the N/8-byte bitmap, least significant byte first */ + } + + /* Extra sign bits are zero for strong unforgeability */ + if (signs >> 60) { + return 1; + } + + for (i = 0; i < N / 8; ++i) { + for (j = 0; j < 8; ++j) { + if ((sig[i] >> j) & 0x01) { + c->coeffs[8 * i + j] = 1; + c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); /* branch-free: the 1 just stored becomes Q - 1 when the low bit of signs is set */ + signs >>= 1; /* consume one sign bit per nonzero coefficient */ + } + } + } + + return 0; +} diff --git a/crypto_sign/dilithium3/avx2/packing.h b/crypto_sign/dilithium3/avx2/packing.h new file mode 100644 index 00000000..5fb7dc00 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/packing.h @@ -0,0 +1,36 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM3_AVX2_PACKING_H + +#include "params.h" +#include "polyvec.h" + +void PQCLEAN_DILITHIUM3_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM3_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2,
+ const polyveck *t0); +void PQCLEAN_DILITHIUM3_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); + +void PQCLEAN_DILITHIUM3_AVX2_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM3_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); + +#endif diff --git a/crypto_sign/dilithium3/avx2/params.h b/crypto_sign/dilithium3/avx2/params.h new file mode 100644 index 00000000..727bee85 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/params.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM3_AVX2_PARAMS_H + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define QBITS 23 +#define D 14 +#define GAMMA1 ((Q - 1)/16) +#define GAMMA2 (GAMMA1/2) +#define ALPHA (2*GAMMA2) + +#define K 5 +#define L 4 +#define ETA 5 +#define SETABITS 4 +#define BETA 275 +#define OMEGA 96 + + +#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) +#define POLT0_SIZE_PACKED ((N*D)/8) +#define POLETA_SIZE_PACKED ((N*SETABITS)/8) +#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) +#define POLW1_SIZE_PACKED ((N*4)/8) + +#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLT1_SIZE_PACKED) +#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + (L + K)*POLETA_SIZE_PACKED + CRHBYTES + K*POLT0_SIZE_PACKED) +#define CRYPTO_BYTES (L*POLZ_SIZE_PACKED + (OMEGA + K) + (N/8 + 8)) + +#endif diff --git a/crypto_sign/dilithium3/avx2/pointwise.S b/crypto_sign/dilithium3/avx2/pointwise.S new file mode 100644 index 00000000..320a91d8 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/pointwise.S @@ -0,0 +1,191 @@ +#include "params.h" + +.global PQCLEAN_DILITHIUM3_AVX2_pointwise_avx +PQCLEAN_DILITHIUM3_AVX2_pointwise_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 + 
+xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vpsrlq $32,%ymm14,%ymm15 + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 +vpmuludq %ymm6,%ymm14,%ymm6 +vpmuludq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3 +vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpaddq %ymm6,%ymm14,%ymm6 +vpaddq %ymm7,%ymm15,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3 
+vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm6,%ymm10,%ymm6 +vpmuludq %ymm7,%ymm11,%ymm7 +vpmuludq %ymm8,%ymm12,%ymm8 +vpmuludq %ymm9,%ymm13,%ymm9 +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 +.endm + +.global PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx +PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 + +pointwise 1024 +acc + +pointwise 2048 +acc + +pointwise 3072 +acc + + +#reduce +vpmuludq %ymm0,%ymm2,%ymm6 +vpmuludq %ymm0,%ymm3,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm8 +vpmuludq %ymm0,%ymm5,%ymm9 +vpmuludq %ymm1,%ymm6,%ymm6 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm8,%ymm8 +vpmuludq %ymm1,%ymm9,%ymm9 +vpaddq %ymm2,%ymm6,%ymm2 +vpaddq %ymm3,%ymm7,%ymm3 +vpaddq %ymm4,%ymm8,%ymm4 +vpaddq %ymm5,%ymm9,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium3/avx2/poly.c b/crypto_sign/dilithium3/avx2/poly.c new file mode 100644 index 00000000..5e4258ff --- /dev/null +++ b/crypto_sign/dilithium3/avx2/poly.c @@ -0,0 +1,914 @@ +#include +#include + +#include 
"fips202x4.h" +#include "ntt.h" +#include "nttconsts.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_reduce +* +* Description: Reduce all coefficients of input polynomial to representative +* in [0,2*Q[. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a) { + PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_csubq +* +* Description: For all coefficients of input polynomial subtract Q if +* coefficient is bigger than Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a) { + PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_freeze +* +* Description: Reduce all coefficients of the polynomial to standard +* representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a) { + PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs); + PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs); + +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + for (i = 0; i < N / 8; i++) { /* eight 32-bit coefficients per 256-bit vector */ + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + vec0 = _mm256_add_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_sub +* +* Description: Subtract polynomials. Assumes coefficients of second input +* polynomial to be less than 2*Q. No modular reduction is +* performed. Each output coefficient is a + 2*Q - b. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec); + + for (i = 0; i < N / 8; i++) { + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + vec0 = _mm256_add_epi32(vec0, twoq); /* bias by 2*Q first so subtracting b < 2*Q cannot go below zero */ + vec0 = _mm256_sub_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{32-D}.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i vec; + + for (i = 0; i < N / 8; i++) { + vec = _mm256_load_si256(&a->coeffs_x8[i]); + vec = _mm256_slli_epi32(vec, D); + _mm256_store_si256(&a->coeffs_x8[i], vec); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_ntt +* +* Description: Forward NTT. Output coefficients can be up to 16*Q larger than +* input coefficients. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + for (i = 0; i < N / 32; ++i) { + PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 1); + } + for (i = 0; i < N / 32; ++i) { + PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 8 + 31 * i); + } +} + +/************************************************* +* Name: poly_invntt_montgomery +* +* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients +* need to be less than 2*Q. Output coefficients are less than 2*Q. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + for (i = 0; i < N / 32; i++) { + PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 31 * i); + } + for (i = 0; i < N / 32; i++) { + PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 248); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* with 2^{-32}. Output coefficients are less than 2*Q if input +* coefficient are less than 22*Q. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { + PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+*
+* Arguments: - poly *a1: pointer to output polynomial with coefficients c1
+* - poly *a0: pointer to output polynomial with coefficients Q + a0
+* - const poly *v: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *restrict a1,
+        poly *restrict a0,
+        const poly *restrict a) {
+    for (size_t i = 0; i < N; ++i) { /* a is the input polynomial (the header comment calls it v) */
+        a1->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); /* return value -> a1, out-parameter -> a0 */
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_decompose
+*
+* Description: For all coefficients c of the input polynomial,
+*              compute high and low bits c0, c1 such that c mod Q = c1*ALPHA + c0
+*              with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we
+*              set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*              Applies the scalar decompose routine to each of the N
+*              coefficients.
+*
+* Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+*              - poly *a0: pointer to output polynomial with coefficients Q + a0
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(
+    poly *restrict a1,
+    poly *restrict a0,
+    const poly *restrict a) {
+    unsigned int i;
+    for (i = 0; i < N; ++i) {
+        a1->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); /* return value -> a1, out-parameter -> a0 */
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_poly_make_hint
+*
+* Description: Compute hint polynomial. The coefficients of which indicate
+* whether the low bits of the corresponding coefficient of
+* the input polynomial overflow into the high bits.
+*
+* Arguments: - poly *h: pointer to output hint polynomial
+* - const poly *a0: pointer to low part of input polynomial
+* - const poly *a1: pointer to high part of input polynomial
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(
+    poly *restrict h,
+    const poly *restrict a0,
+    const poly *restrict a1) {
+    unsigned int i, s = 0;
+    for (i = 0; i < N; ++i) {
+        h->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]);
+        s += h->coeffs[i]; /* running sum of the hint coefficients; this is the return value */
+    }
+    return s;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_use_hint
+*
+* Description: Use hint polynomial to correct the high bits of a polynomial.
+*              Applies the scalar use_hint routine to each of the N
+*              coefficient pairs (b[i], h[i]).
+*
+* Arguments:   - poly *a: pointer to output polynomial with corrected high bits
+*              - const poly *b: pointer to input polynomial
+*              - const poly *h: pointer to input hint polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(
+    poly *restrict a,
+    const poly *restrict b,
+    const poly *restrict h) {
+    unsigned int i;
+
+    for (i = 0; i < N; ++i) {
+        a->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_use_hint(b->coeffs[i], h->coeffs[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_chknorm
+*
+* Description: Check infinity norm of polynomial against given bound.
+*              Assumes input coefficients to be standard representatives.
+*              Returns as soon as the first offending coefficient is found.
+*
+* Arguments:   - const poly *a: pointer to polynomial
+*              - uint32_t B: norm bound
+*
+* Returns 0 if norm is strictly smaller than B and 1 otherwise.
+**************************************************/
+int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B) {
+    unsigned int i;
+    int32_t t;
+
+    /* It is ok to leak which coefficient violates the bound since
+       the probability for each coefficient is independent of secret
+       data but we must not leak the sign of the centralized representative. */
+    for (i = 0; i < N; ++i) {
+        /* Absolute value of centralized representative */
+        t = (Q - 1) / 2 - a->coeffs[i];
+        t ^= (t >> 31); /* branch-free: t >> 31 is all ones for negative t (relies on arithmetic right shift) */
+        t = (Q - 1) / 2 - t;
+
+        if ((uint32_t)t >= B) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name:        rej_uniform_ref
+*
+* Description: Sample uniformly random coefficients in [0, Q-1] by
+*              performing rejection sampling using array of random bytes.
+*              Consumes the buffer three bytes at a time; each group yields
+*              one 23-bit candidate that is kept only if it is below Q.
+*
+* Arguments:   - uint32_t *a: pointer to output array (allocated)
+*              - unsigned int len: number of coefficients to be sampled
+*              - const unsigned char *buf: array of random bytes
+*              - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_uniform_ref(uint32_t *a,
+                                    unsigned int len,
+                                    const unsigned char *buf,
+                                    unsigned int buflen) {
+    unsigned int ctr, pos;
+    uint32_t t;
+
+    ctr = pos = 0;
+    while (ctr < len && pos + 3 <= buflen) { /* stop when a is full or fewer than 3 bytes remain */
+        t  = buf[pos++]; /* three bytes, least significant first */
+        t |= (uint32_t)buf[pos++] << 8;
+        t |= (uint32_t)buf[pos++] << 16;
+        t &= 0x7FFFFF; /* keep the low 23 bits */
+
+        if (t < Q) { /* reject candidates outside [0, Q-1] */
+            a[ctr++] = t;
+        }
+    }
+
+    return ctr;
+}
+
+/*************************************************
+* Name: poly_uniform
+*
+* Description: Sample polynomial with uniformly random coefficients
+* in [0,Q-1] by performing rejection sampling using the
+* output stream from SHAKE256(seed|nonce).
+*
+* Arguments: - poly *a: pointer to output polynomial
+* - const unsigned char seed[]: byte array with seed of length
+* SEEDBYTES
+* - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) /* blocks squeezed up front: floor(769 / blocksize) + 1 */
+#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES)
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a,
+        const unsigned char seed[SEEDBYTES],
+        uint16_t nonce) {
+    unsigned int i, ctr, off;
+    unsigned int nblocks = POLY_UNIFORM_NBLOCKS;
+    unsigned int buflen = POLY_UNIFORM_BUFLEN;
+    unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; /* +2: room for up to two leftover bytes (buflen % 3) in front of a refill block */
+    stream128_state state;
+
+    stream128_init(&state, seed, nonce);
+    stream128_squeezeblocks(buf, nblocks, &state);
+
+    ctr = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a->coeffs, N, buf, buflen);
+
+    while (ctr < N) { /* not enough accepted candidates yet: squeeze one more block at a time */
+        off = buflen % 3; /* bytes at the end of buf that did not form a full 3-byte group */
+        for (i = 0; i < off; ++i) {
+            buf[i] = buf[buflen - off + i]; /* move them to the front so the byte stream stays contiguous */
+        }
+
+        buflen = STREAM128_BLOCKBYTES + off;
+        stream128_squeezeblocks(buf + off, 1, &state);
+        ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen);
+    }
+}
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x
+*
+* Description: Four-lane variant of poly_uniform. Builds four inputs
+*              seed || nonce_k (nonce as two bytes, low byte first), absorbs
+*              them with shake128_absorb4x, squeezes 5 blocks per lane and
+*              rejection-samples each lane with rej_uniform. While any lane
+*              has fewer than N coefficients, one more block per lane is
+*              squeezed and sampled with rej_uniform_ref.
+*              NOTE(review): every refill block is parsed from its first
+*              byte; unlike poly_uniform, no leftover bytes are carried over.
+*              The two agree only if the squeezed lengths are multiples
+*              of 3 -- confirm against the value of SHAKE128_RATE.
+*
+* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to the four output polynomials
+*              - const unsigned char seed[]: byte array with seed of length
+*                                            SEEDBYTES, shared by all lanes
+*              - uint16_t nonce0..nonce3: 2-byte nonce of each lane
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0,
+        poly *a1,
+        poly *a2,
+        poly *a3,
+        const unsigned char seed[SEEDBYTES],
+        uint16_t nonce0,
+        uint16_t nonce1,
+        uint16_t nonce2,
+        uint16_t nonce3) {
+    unsigned int i, ctr0, ctr1, ctr2, ctr3;
+    unsigned char inbuf[4][SEEDBYTES + 2]; /* per lane: seed followed by the 2-byte nonce */
+    unsigned char outbuf[4][5 * SHAKE128_RATE];
+    __m256i state[25];
+
+    for (i = 0; i < SEEDBYTES; ++i) { /* same seed in all four lanes */
+        inbuf[0][i] = seed[i];
+        inbuf[1][i] = seed[i];
+        inbuf[2][i] = seed[i];
+        inbuf[3][i] = seed[i];
+    }
+    inbuf[0][SEEDBYTES + 0] = nonce0; /* low byte first (assignment to unsigned char truncates) */
+    inbuf[0][SEEDBYTES + 1] = nonce0 >> 8;
+    inbuf[1][SEEDBYTES + 0] = nonce1;
+    inbuf[1][SEEDBYTES + 1] = nonce1 >> 8;
+    inbuf[2][SEEDBYTES + 0] = nonce2;
+    inbuf[2][SEEDBYTES + 1] = nonce2 >> 8;
+    inbuf[3][SEEDBYTES + 0] = nonce3;
+    inbuf[3][SEEDBYTES + 1] = nonce3 >> 8;
+
+    PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
+            SEEDBYTES + 2);
+    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
+            state);
+
+    ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE);
+    ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE);
+    ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE);
+    ctr3 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE);
+
+    while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* all lanes squeeze together; finished lanes request N - ctr == 0 */
+        PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
+                state);
+
+        ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0],
+                                SHAKE128_RATE);
+        ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1],
+                                SHAKE128_RATE);
+        ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2],
+                                SHAKE128_RATE);
+        ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3],
+                                SHAKE128_RATE);
+    }
+}
+
+/*************************************************
+* Name: rej_eta
+*
+* Description: Sample uniformly random coefficients in [-ETA, ETA] by
+* performing rejection sampling using array of random bytes.
+*
+* Arguments: - uint32_t *a: pointer to output array (allocated)
+* - unsigned int len: number of coefficients to be sampled
+* - const unsigned char *buf: array of random bytes
+* - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_eta_ref(uint32_t *a,
+                                unsigned int len,
+                                const unsigned char *buf,
+                                unsigned int buflen) {
+    unsigned int ctr, pos;
+    uint32_t t0, t1;
+
+    ctr = pos = 0;
+    while (ctr < len && pos < buflen) {
+        t0 = buf[pos] & 0x0F; /* low nibble: first candidate */
+        t1 = buf[pos++] >> 4; /* high nibble: second candidate */
+
+        if (t0 <= 2 * ETA) { /* accept candidates in [0, 2*ETA] and store Q + ETA - t */
+            a[ctr++] = Q + ETA - t0;
+        }
+        if (t1 <= 2 * ETA && ctr < len) { /* re-check ctr: the first candidate may have filled the array */
+            a[ctr++] = Q + ETA - t1;
+        }
+    }
+
+    return ctr;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta
+*
+* Description: Sample polynomial with uniformly random coefficients
+*              in [-ETA,ETA] by performing rejection sampling using the
+*              stream128 output stream initialised with seed and nonce.
+*              The first POLY_UNIFORM_ETA_NBLOCKS blocks are sampled with
+*              rej_eta; if that yields fewer than N coefficients, further
+*              single blocks are sampled with rej_eta_ref.
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const unsigned char seed[]: byte array with seed of length
+*                                            SEEDBYTES
+*              - uint16_t nonce: 2-byte nonce
+**************************************************/
+#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES)
+#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a,
+        const unsigned char seed[SEEDBYTES],
+        uint16_t nonce) {
+    unsigned int ctr;
+    unsigned char buf[POLY_UNIFORM_ETA_BUFLEN];
+    stream128_state state;
+
+    stream128_init(&state, seed, nonce);
+    stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state);
+
+    ctr = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN);
+
+    while (ctr < N) { /* candidates are consumed byte by byte, so no leftover bytes need carrying over */
+        stream128_squeezeblocks(buf, 1, &state);
+        ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES);
+    }
+}
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x
+*
+* Description: Four-lane variant of poly_uniform_eta. Builds four inputs
+*              seed || nonce_k (nonce as two bytes, low byte first), absorbs
+*              them with shake128_absorb4x, squeezes 2 blocks per lane and
+*              rejection-samples each lane with rej_eta. While any lane has
+*              fewer than N coefficients, one more block per lane is
+*              squeezed and sampled with rej_eta_ref.
+*
+* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to the four output polynomials
+*              - const unsigned char seed[]: byte array with seed of length
+*                                            SEEDBYTES, shared by all lanes
+*              - uint16_t nonce0..nonce3: 2-byte nonce of each lane
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0,
+        poly *a1,
+        poly *a2,
+        poly *a3,
+        const unsigned char seed[SEEDBYTES],
+        uint16_t nonce0,
+        uint16_t nonce1,
+        uint16_t nonce2,
+        uint16_t nonce3) {
+    unsigned int i, ctr0, ctr1, ctr2, ctr3;
+    unsigned char inbuf[4][SEEDBYTES + 2]; /* per lane: seed followed by the 2-byte nonce */
+    unsigned char outbuf[4][2 * SHAKE128_RATE];
+    __m256i state[25];
+
+    for (i = 0; i < SEEDBYTES; ++i) { /* same seed in all four lanes */
+        inbuf[0][i] = seed[i];
+        inbuf[1][i] = seed[i];
+        inbuf[2][i] = seed[i];
+        inbuf[3][i] = seed[i];
+    }
+    inbuf[0][SEEDBYTES + 0] = nonce0; /* low byte first (assignment to unsigned char truncates) */
+    inbuf[0][SEEDBYTES + 1] = nonce0 >> 8;
+    inbuf[1][SEEDBYTES + 0] = nonce1;
+    inbuf[1][SEEDBYTES + 1] = nonce1 >> 8;
+    inbuf[2][SEEDBYTES + 0] = nonce2;
+    inbuf[2][SEEDBYTES + 1] = nonce2 >> 8;
+    inbuf[3][SEEDBYTES + 0] = nonce3;
+    inbuf[3][SEEDBYTES + 1] = nonce3 >> 8;
+
+    PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
+            SEEDBYTES + 2);
+    PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2,
+            state);
+
+    ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE);
+    ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE);
+    ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE);
+    ctr3 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE);
+
+    while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* all lanes squeeze together; finished lanes request N - ctr == 0 */
+        PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
+                state);
+
+        ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE);
+        ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE);
+        ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE);
+        ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE);
+    }
+}
+
+/*************************************************
+* Name: rej_gamma1m1_ref
+*
+* Description: Sample uniformly random coefficients
+* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling
+* using array of random bytes.
+*
+* Arguments: - uint32_t *a: pointer to output array (allocated)
+* - unsigned int len: number of coefficients to be sampled
+* - const unsigned char *buf: array of random bytes
+* - unsigned int buflen: length of array of random bytes
+*
+* Returns number of sampled coefficients. Can be smaller than len if not enough
+* random bytes were given.
+**************************************************/
+static unsigned int rej_gamma1m1_ref(uint32_t *a,
+                                     unsigned int len,
+                                     const unsigned char *buf,
+                                     unsigned int buflen) {
+    unsigned int ctr, pos;
+    uint32_t t0, t1;
+
+    ctr = pos = 0;
+    while (ctr < len && pos + 5 <= buflen) { /* each 5-byte group carries two 20-bit candidates */
+        t0  = buf[pos]; /* first candidate: bytes 0-1 plus the low nibble of byte 2 */
+        t0 |= (uint32_t)buf[pos + 1] << 8;
+        t0 |= (uint32_t)buf[pos + 2] << 16;
+        t0 &= 0xFFFFF; /* keep the low 20 bits */
+
+        t1  = buf[pos + 2] >> 4; /* second candidate: high nibble of byte 2 plus bytes 3-4 */
+        t1 |= (uint32_t)buf[pos + 3] << 4;
+        t1 |= (uint32_t)buf[pos + 4] << 12;
+
+        pos += 5;
+
+        if (t0 <= 2 * GAMMA1 - 2) { /* accept candidates in [0, 2*GAMMA1 - 2] and store Q + GAMMA1 - 1 - t */
+            a[ctr++] = Q + GAMMA1 - 1 - t0;
+        }
+        if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { /* re-check ctr: the first candidate may have filled the array */
+            a[ctr++] = Q + GAMMA1 - 1 - t1;
+        }
+    }
+    return ctr;
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1
+*
+* Description: Sample polynomial with uniformly random coefficients
+* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection
+* sampling on output stream of SHAKE256(seed|nonce).
+*
+* Arguments: - poly *a: pointer to output polynomial
+* - const unsigned char seed[]: byte array with seed of length
+* CRHBYTES
+* - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) /* blocks squeezed up front: floor(641 / blocksize) + 1 */
+#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES)
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1(poly *a,
+        const unsigned char seed[CRHBYTES],
+        uint16_t nonce) {
+    unsigned int i, ctr, off;
+    unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN;
+    unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; /* +4: room for up to four leftover bytes (buflen % 5) in front of a refill block */
+    stream256_state state;
+
+    stream256_init(&state, seed, nonce);
+    stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state);
+
+    ctr = PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN);
+
+    while (ctr < N) { /* not enough accepted candidates yet: squeeze one more block at a time */
+        off = buflen % 5; /* bytes at the end of buf that did not form a full 5-byte group */
+        for (i = 0; i < off; ++i) {
+            buf[i] = buf[buflen - off + i]; /* move them to the front so the byte stream stays contiguous */
+        }
+
+        buflen = STREAM256_BLOCKBYTES + off;
+        stream256_squeezeblocks(buf + off, 1, &state);
+        ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen);
+    }
+}
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x
+*
+* Description: Four-lane variant of poly_uniform_gamma1m1. Builds four inputs
+*              seed || nonce_k (nonce as two bytes, low byte first), absorbs
+*              them with shake256_absorb4x, squeezes 5 blocks per lane and
+*              rejection-samples each lane with rej_gamma1m1_ref. While any
+*              lane has fewer than N coefficients, one more block per lane
+*              is squeezed and sampled with rej_gamma1m1_ref.
+*              NOTE(review): every refill block is parsed from its first
+*              byte; unlike poly_uniform_gamma1m1, no leftover bytes are
+*              carried over. The two agree only if the squeezed lengths are
+*              multiples of 5 -- confirm against the value of SHAKE256_RATE.
+*
+* Arguments:   - poly *a0, *a1, *a2, *a3: pointers to the four output polynomials
+*              - const unsigned char seed[]: byte array with seed of length
+*                                            CRHBYTES, shared by all lanes
+*              - uint16_t nonce0..nonce3: 16-bit nonce of each lane
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0,
+        poly *a1,
+        poly *a2,
+        poly *a3,
+        const unsigned char seed[CRHBYTES],
+        uint16_t nonce0,
+        uint16_t nonce1,
+        uint16_t nonce2,
+        uint16_t nonce3) {
+    unsigned int i, ctr0, ctr1, ctr2, ctr3;
+    unsigned char inbuf[4][CRHBYTES + 2]; /* per lane: seed followed by the 2-byte nonce */
+    unsigned char outbuf[4][5 * SHAKE256_RATE];
+    __m256i state[25];
+
+    for (i = 0; i < CRHBYTES; ++i) { /* same seed in all four lanes */
+        inbuf[0][i] = seed[i];
+        inbuf[1][i] = seed[i];
+        inbuf[2][i] = seed[i];
+        inbuf[3][i] = seed[i];
+    }
+    inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; /* low byte first */
+    inbuf[0][CRHBYTES + 1] = nonce0 >> 8;
+    inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF;
+    inbuf[1][CRHBYTES + 1] = nonce1 >> 8;
+    inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF;
+    inbuf[2][CRHBYTES + 1] = nonce2 >> 8;
+    inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF;
+    inbuf[3][CRHBYTES + 1] = nonce3 >> 8;
+
+    PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3],
+            CRHBYTES + 2);
+    PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5,
+            state);
+
+    ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE);
+    ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE);
+    ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE);
+    ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE);
+
+    while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* all lanes squeeze together; finished lanes request N - ctr == 0 */
+        PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1,
+                state);
+
+        ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0],
+                                 SHAKE256_RATE);
+        ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1],
+                                 SHAKE256_RATE);
+        ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2],
+                                 SHAKE256_RATE);
+        ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3],
+                                 SHAKE256_RATE);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyeta_pack
+*
+* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+*              Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
+*              Two coefficients are stored per output byte: the even-indexed
+*              one in the low nibble, the odd-indexed one in the high nibble.
+*
+* Arguments:   - unsigned char *r: pointer to output byte array with at least
+*                                  POLETA_SIZE_PACKED bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int i;
+    unsigned char t[8]; /* only t[0] and t[1] are used below */
+
+    for (i = 0; i < N / 2; ++i) {
+        t[0] = Q + ETA - a->coeffs[2 * i + 0]; /* maps [Q-ETA, Q+ETA] to [0, 2*ETA] */
+        t[1] = Q + ETA - a->coeffs[2 * i + 1];
+        r[i] = t[0] | (t[1] << 4);
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack
+*
+* Description: Unpack polynomial with coefficients in [-ETA,ETA].
+* Output coefficients lie in [Q-ETA,Q+ETA].
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int i;
+
+    for (i = 0; i < N / 2; ++i) {
+        r->coeffs[2 * i + 0] = a[i] & 0x0F; /* low nibble -> even coefficient */
+        r->coeffs[2 * i + 1] = a[i] >> 4; /* high nibble -> odd coefficient */
+        r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; /* undo the Q + ETA - c mapping applied when packing */
+        r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1];
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyt1_pack
+*
+* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
+*              Input coefficients are assumed to be standard representatives.
+*              Each group of 8 coefficients is written to 9 consecutive
+*              bytes, least significant bits first.
+*
+* Arguments:   - unsigned char *r: pointer to output byte array with at least
+*                                  POLT1_SIZE_PACKED bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int i;
+
+    for (i = 0; i < N / 8; ++i) { /* the (uint8_t) casts keep only the low 8 bits of each combined value */
+        r[9 * i + 0]  = (uint8_t)((a->coeffs[8 * i + 0] >> 0));
+        r[9 * i + 1]  = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1));
+        r[9 * i + 2]  = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2));
+        r[9 * i + 3]  = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3));
+        r[9 * i + 4]  = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4));
+        r[9 * i + 5]  = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5));
+        r[9 * i + 6]  = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6));
+        r[9 * i + 7]  = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7));
+        r[9 * i + 8]  = (uint8_t)((a->coeffs[8 * i + 7] >> 1));
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack
+*
+* Description: Unpack polynomial t1 with 
9-bit coefficients.
+* Output coefficients are standard representatives.
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int i;
+    for (i = 0; i < N / 8; ++i) { /* 9 input bytes -> 8 coefficients; & 0x1FF keeps 9 bits each */
+        r->coeffs[8 * i + 0] = ((a[9 * i + 0] >> 0) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF;
+        r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF;
+        r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF;
+        r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF;
+        r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF;
+        r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF;
+        r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF;
+        r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF;
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
+*
+* Arguments: - unsigned char *r: pointer to output byte array with at least
+* POLT0_SIZE_PACKED bytes
+* - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int i;
+    uint32_t t[4];
+
+    for (i = 0; i < N / 4; ++i) { /* four coefficients are packed into seven bytes per iteration */
+        t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; /* map each coefficient c to Q + 2^(D-1) - c */
+        t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1];
+        t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2];
+        t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3];
+
+        r[7 * i + 0]  =  t[0]; /* assignments to unsigned char keep only the low 8 bits */
+        r[7 * i + 1]  =  t[0] >> 8;
+        r[7 * i + 1] |=  t[1] << 6; /* t[1] starts at bit 6 of byte 1 */
+        r[7 * i + 2]  =  t[1] >> 2;
+        r[7 * i + 3]  =  t[1] >> 10;
+        r[7 * i + 3] |=  t[2] << 4; /* t[2] starts at bit 4 of byte 3 */
+        r[7 * i + 4]  =  t[2] >> 4;
+        r[7 * i + 5]  =  t[2] >> 12;
+        r[7 * i + 5] |=  t[3] << 2; /* t[3] starts at bit 2 of byte 5 */
+        r[7 * i + 6]  =  t[3] >> 6;
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
+*
+* Arguments: - poly *r: pointer to output polynomial
+* - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int i;
+
+    for (i = 0; i < N / 4; ++i) { /* seven input bytes -> four 14-bit values per iteration */
+        r->coeffs[4 * i + 0]  = a[7 * i + 0];
+        r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; /* low 6 bits of byte 1 */
+
+        r->coeffs[4 * i + 1]  = a[7 * i + 1] >> 6; /* high 2 bits of byte 1 */
+        r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2;
+        r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; /* low 4 bits of byte 3 */
+
+        r->coeffs[4 * i + 2]  = a[7 * i + 3] >> 4; /* high 4 bits of byte 3 */
+        r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4;
+        r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; /* low 2 bits of byte 5 */
+
+        r->coeffs[4 * i + 3]  = a[7 * i + 5] >> 2; /* high 6 bits of byte 5 */
+        r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6;
+
+        r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; /* undo the Q + 2^(D-1) - c mapping applied when packing */
+        r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1];
+        r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2];
+        r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3];
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyz_pack
+*
+* Description: Bit-pack polynomial z with coefficients
+* in [-(GAMMA1 - 1), GAMMA1 - 1].
+* Input coefficients are assumed to be standard representatives.
+*
+* Arguments: - unsigned char *r: pointer to output byte array with at least
+* POLZ_SIZE_PACKED bytes
+* - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int i;
+    uint32_t t[2];
+
+    for (i = 0; i < N / 2; ++i) { /* two coefficients are packed into five bytes per iteration */
+        /* Map to {0,...,2*GAMMA1 - 2} */
+        t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0];
+        t[0] += ((int32_t)t[0] >> 31) & Q; /* branch-free: add Q when the difference has its top bit set */
+        t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1];
+        t[1] += ((int32_t)t[1] >> 31) & Q;
+
+        r[5 * i + 0]  = t[0]; /* assignments to unsigned char keep only the low 8 bits */
+        r[5 * i + 1]  = t[0] >> 8;
+        r[5 * i + 2]  = t[0] >> 16;
+        r[5 * i + 2] |= t[1] << 4; /* t[1] starts at bit 4 of byte 2 */
+        r[5 * i + 3]  = t[1] >> 4;
+        r[5 * i + 4]  = t[1] >> 12;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyz_unpack
+*
+* Description: Unpack polynomial z with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1 - 1].
+*              Output coefficients are standard representatives.
+*              Reads five bytes per pair of coefficients and applies the
+*              same GAMMA1 - 1 - x mapping (plus conditional + Q) that
+*              polyz_pack uses.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int i;
+
+    for (i = 0; i < N / 2; ++i) {
+        r->coeffs[2 * i + 0]  = a[5 * i + 0];
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
+        r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; /* low nibble of byte 2 */
+
+        r->coeffs[2 * i + 1]  = a[5 * i + 2] >> 4; /* high nibble of byte 2 */
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
+
+        r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
+        r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; /* branch-free: add Q when the difference has its top bit set */
+        r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
+        r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q;
+    }
+
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. +* Input coefficients are assumed to be standard representatives. +* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLW1_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); + } +} diff --git a/crypto_sign/dilithium3/avx2/poly.h b/crypto_sign/dilithium3/avx2/poly.h new file mode 100644 index 00000000..a944b600 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/poly.h @@ -0,0 +1,83 @@ +#ifndef POLY_H +#define POLY_H + +#include +#include + +#include "alignment.h" +#include "params.h" + +typedef union { + uint32_t coeffs[N]; + __m256i coeffs_x8[N / 8]; +} poly; + +void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); + +int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, + const 
uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); + +void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t *r, const poly *a); +#endif diff --git a/crypto_sign/dilithium3/avx2/polyvec.c b/crypto_sign/dilithium3/avx2/polyvec.c new file mode 100644 index 00000000..0c1647bd --- /dev/null +++ b/crypto_sign/dilithium3/avx2/polyvec.c @@ -0,0 +1,353 @@ +#include + +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + 
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length L
+*              to standard representatives.
+*              Calls poly_freeze on each of the L polynomials in turn.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_add
+*
+* Description: Add vectors of polynomials of length L.
+*              No modular reduction is performed.
+*              Calls poly_add on each of the L polynomial pairs in turn.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*              Calls poly_ntt on each of the L polynomials in turn.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery
+*
+* Description: Pointwise multiply vectors of polynomials of length L, multiply
+* resulting vector by 2^{-32} and add (accumulate) polynomials
+* in it. Input/output vectors are in NTT domain representation.
+* Input coefficients are assumed to be less than 22*Q. Output
+* coeffcient are less than 2*L*Q.
+*
+* Arguments: - poly *w: output polynomial
+* - const polyvecl *u: pointer to first input vector
+* - const polyvecl *v: pointer to second input vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
+        const polyvecl *u,
+        const polyvecl *v) {
+    PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); /* u->vec->coeffs is the coefficient array of the first polynomial of u; presumably the kernel walks all L polynomials from there -- confirm against ntt.h */
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input coefficients to be standard representatives.
+*              Stops at the first polynomial for which poly_chknorm
+*              reports a violation.
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - uint32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than bound and 1
+* otherwise.
+**************************************************/
+int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+* to representatives in [0,2*Q[.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq
+*
+* Description: For all coefficients of polynomials in vector of length K
+*              subtract Q if coefficient is bigger than Q (poly_csubq on each entry).
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_csubq(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to standard representatives (poly_freeze on each entry).
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_add
+*
+* Description: Add vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first summand
+*              - const polyveck *v: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_sub
+*
+* Description: Subtract vectors of polynomials of length K, entry by entry.
+*              Assumes coefficients of polynomials in second input vector
+*              to be less than 2*Q. No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first input vector
+*              - const polyveck *v: pointer to second input vector to be
+*                                   subtracted from first input vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl
+*
+* Description: Multiply vector of polynomials of length K by 2^D without modular
+*              reduction. Assumes input coefficients to be less than 2^{32-D}.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length K. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery
+*
+* Description: Inverse NTT and multiplication by 2^{32} of polynomials
+*              in vector of length K. Input coefficients need to be less
+*              than 2*Q.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length K.
+*              Assumes input coefficients to be standard representatives.
+*
+* Arguments:   - const polyveck *v: pointer to vector
+*              - uint32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than bound and 1
+* otherwise (stops at the first polynomial that fails the check).
+**************************************************/
+int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute a0, a1 such that a mod Q = a1*2^D + a0
+*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients Q + a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such that a mod Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients Q + a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(polyveck *h,
+        const polyveck *v0,
+        const polyveck *v1) {
+    unsigned int i, s = 0;
+
+    for (i = 0; i < K; ++i) {
+        s += PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); /* accumulate the per-polynomial return values */
+    }
+
+    return s;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint
+*
+* Description: Use hint vector to correct the high bits of input vector.
+*
+* Arguments:   - polyveck *w: pointer to output vector of polynomials with
+*                             corrected high bits
+*              - const polyveck *v: pointer to input vector
+*              - const polyveck *h: pointer to input hint vector
+**************************************************/
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]);
+    }
+}
diff --git a/crypto_sign/dilithium3/avx2/polyvec.h b/crypto_sign/dilithium3/avx2/polyvec.h
new file mode 100644
index 00000000..581319f7
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/polyvec.h
@@ -0,0 +1,52 @@
+#ifndef PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H
+#define PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H
+
+#include <stdint.h>
+
+#include "params.h"
+#include "poly.h"
+
+/* Vectors of polynomials of length L */
+typedef struct {
+    poly vec[L];
+} polyvecl;
+
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v);
+
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
+
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
+        const polyvecl *u,
+        const polyvecl *v);
+
+int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B);
+
+
+
+/* Vectors of polynomials of length K */
+typedef struct {
+    poly vec[K];
+} polyveck;
+
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v);
+
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v);
+
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v);
+
+int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t B);
+
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
+unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(polyveck *h,
+        const polyveck *v0,
+        const polyveck *v1);
+void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);
+
+#endif
diff --git a/crypto_sign/dilithium3/avx2/reduce.h b/crypto_sign/dilithium3/avx2/reduce.h
new file mode 100644
index 00000000..2488cbfd
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/reduce.h
@@ -0,0 +1,9 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+
+void PQCLEAN_DILITHIUM3_AVX2_reduce_avx(uint32_t a[N]);
+void PQCLEAN_DILITHIUM3_AVX2_csubq_avx(uint32_t a[N]);
+
+#endif
diff --git a/crypto_sign/dilithium3/avx2/reduce.s b/crypto_sign/dilithium3/avx2/reduce.s
new file mode 100644
index 00000000..ca14e432
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/reduce.s
@@ -0,0 +1,91 @@
+.global PQCLEAN_DILITHIUM3_AVX2_reduce_avx
+PQCLEAN_DILITHIUM3_AVX2_reduce_avx:
+#consts
+vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8x23ones(%rip),%ymm0
+
+xor %eax,%eax
+_looptop_rdc32:
+#load
+vmovdqa (%rdi),%ymm1
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm5
+vmovdqa 96(%rdi),%ymm7
+
+#reduce
+vpsrld $23,%ymm1,%ymm2
+vpsrld $23,%ymm3,%ymm4
+vpsrld $23,%ymm5,%ymm6
+vpsrld $23,%ymm7,%ymm8
+vpand %ymm0,%ymm1,%ymm1
+vpand %ymm0,%ymm3,%ymm3
+vpand %ymm0,%ymm5,%ymm5
+vpand %ymm0,%ymm7,%ymm7
+vpsubd %ymm2,%ymm1,%ymm1
+vpsubd %ymm4,%ymm3,%ymm3
+vpsubd %ymm6,%ymm5,%ymm5
+vpsubd %ymm8,%ymm7,%ymm7
+vpslld $13,%ymm2,%ymm2
+vpslld $13,%ymm4,%ymm4
+vpslld $13,%ymm6,%ymm6
+vpslld $13,%ymm8,%ymm8
+vpaddd %ymm2,%ymm1,%ymm1
+vpaddd %ymm4,%ymm3,%ymm3
+vpaddd %ymm6,%ymm5,%ymm5
+vpaddd %ymm8,%ymm7,%ymm7
+
+#store
+vmovdqa %ymm1,(%rdi)
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm5,64(%rdi)
+vmovdqa %ymm7,96(%rdi)
+
+add $128,%rdi
+add $1,%eax
+cmp $8,%eax
+jb _looptop_rdc32
+
+ret
+
+.global PQCLEAN_DILITHIUM3_AVX2_csubq_avx
+PQCLEAN_DILITHIUM3_AVX2_csubq_avx:
+#consts
+vmovdqa _PQCLEAN_DILITHIUM3_AVX2_8xq(%rip),%ymm0
+
+xor %eax,%eax
+_looptop_csubq:
+#load
+vmovdqa (%rdi),%ymm1
+vmovdqa 32(%rdi),%ymm3
+vmovdqa 64(%rdi),%ymm5
+vmovdqa 96(%rdi),%ymm7
+
+#PQCLEAN_DILITHIUM3_AVX2_csubq
+vpsubd %ymm0,%ymm1,%ymm1
+vpsubd %ymm0,%ymm3,%ymm3
+vpsubd %ymm0,%ymm5,%ymm5
+vpsubd %ymm0,%ymm7,%ymm7
+vpsrad $31,%ymm1,%ymm2
+vpsrad $31,%ymm3,%ymm4
+vpsrad $31,%ymm5,%ymm6
+vpsrad $31,%ymm7,%ymm8
+vpand %ymm0,%ymm2,%ymm2
+vpand %ymm0,%ymm4,%ymm4
+vpand %ymm0,%ymm6,%ymm6
+vpand %ymm0,%ymm8,%ymm8
+vpaddd %ymm2,%ymm1,%ymm1
+vpaddd %ymm4,%ymm3,%ymm3
+vpaddd %ymm6,%ymm5,%ymm5
+vpaddd %ymm8,%ymm7,%ymm7
+
+#store
+vmovdqa %ymm1,(%rdi)
+vmovdqa %ymm3,32(%rdi)
+vmovdqa %ymm5,64(%rdi)
+vmovdqa %ymm7,96(%rdi)
+
+add $128,%rdi
+add $1,%eax
+cmp $8,%eax
+jb _looptop_csubq
+
+ret
diff --git a/crypto_sign/dilithium3/avx2/rejsample.c b/crypto_sign/dilithium3/avx2/rejsample.c
new file mode 100644
index 00000000..ae3c2933
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/rejsample.c
@@ -0,0 +1,443 @@
+#include <immintrin.h>
+#include <stdint.h>
+
+#include "params.h"
+#include "rejsample.h"
+
+static const uint8_t idx[256][8] = { /* row m lists, in ascending order, the positions of the set bits of m; unused slots are 0 */
+    { 0, 0, 0, 0, 0, 0, 0, 0},
+    { 0, 0, 0, 0, 0, 0, 0, 0},
+    { 1, 0, 0, 0, 0, 0, 0, 0},
+    { 0, 1, 0, 0, 0, 0, 0, 0},
+    { 2, 0, 0, 0, 0, 0, 0, 0},
+    { 0, 2, 0, 0, 0, 0, 0, 0},
+    { 1, 2, 0, 0, 0, 0, 0, 0},
+    { 0, 1, 2, 0, 0, 0, 0, 0},
+    {
3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + 
{ 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, 
+ { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 0, 0, 0, 0}, + { 1, 2, 6, 7, 0, 0, 0, 0}, + { 0, 1, 2, 6, 7, 0, 0, 0}, + { 3, 6, 7, 0, 0, 0, 0, 0}, + { 0, 3, 6, 7, 0, 0, 0, 0}, + { 1, 3, 6, 7, 0, 0, 0, 0}, + { 0, 1, 3, 6, 7, 0, 0, 0}, + { 2, 3, 6, 7, 0, 0, 0, 0}, + { 0, 2, 3, 6, 7, 0, 0, 0}, + { 1, 2, 3, 6, 7, 0, 0, 0}, + { 0, 1, 2, 3, 6, 7, 0, 0}, + { 4, 6, 7, 0, 0, 0, 0, 0}, + { 0, 4, 6, 7, 0, 0, 0, 0}, + { 1, 4, 6, 7, 0, 0, 0, 0}, + { 0, 1, 4, 6, 7, 0, 0, 0}, + { 2, 4, 6, 7, 0, 0, 0, 0}, + { 0, 2, 4, 6, 7, 0, 0, 0}, + { 1, 2, 4, 6, 7, 0, 0, 
0},
+    { 0, 1, 2, 4, 6, 7, 0, 0},
+    { 3, 4, 6, 7, 0, 0, 0, 0},
+    { 0, 3, 4, 6, 7, 0, 0, 0},
+    { 1, 3, 4, 6, 7, 0, 0, 0},
+    { 0, 1, 3, 4, 6, 7, 0, 0},
+    { 2, 3, 4, 6, 7, 0, 0, 0},
+    { 0, 2, 3, 4, 6, 7, 0, 0},
+    { 1, 2, 3, 4, 6, 7, 0, 0},
+    { 0, 1, 2, 3, 4, 6, 7, 0},
+    { 5, 6, 7, 0, 0, 0, 0, 0},
+    { 0, 5, 6, 7, 0, 0, 0, 0},
+    { 1, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 1, 5, 6, 7, 0, 0, 0},
+    { 2, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 2, 5, 6, 7, 0, 0, 0},
+    { 1, 2, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 2, 5, 6, 7, 0, 0},
+    { 3, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 3, 5, 6, 7, 0, 0, 0},
+    { 1, 3, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 3, 5, 6, 7, 0, 0},
+    { 2, 3, 5, 6, 7, 0, 0, 0},
+    { 0, 2, 3, 5, 6, 7, 0, 0},
+    { 1, 2, 3, 5, 6, 7, 0, 0},
+    { 0, 1, 2, 3, 5, 6, 7, 0},
+    { 4, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 4, 5, 6, 7, 0, 0, 0},
+    { 1, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 4, 5, 6, 7, 0, 0},
+    { 2, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 2, 4, 5, 6, 7, 0, 0},
+    { 1, 2, 4, 5, 6, 7, 0, 0},
+    { 0, 1, 2, 4, 5, 6, 7, 0},
+    { 3, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 3, 4, 5, 6, 7, 0, 0},
+    { 1, 3, 4, 5, 6, 7, 0, 0},
+    { 0, 1, 3, 4, 5, 6, 7, 0},
+    { 2, 3, 4, 5, 6, 7, 0, 0},
+    { 0, 2, 3, 4, 5, 6, 7, 0},
+    { 1, 2, 3, 4, 5, 6, 7, 0},
+    { 0, 1, 2, 3, 4, 5, 6, 7}
+};
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen) {
+    unsigned int i, ctr, pos;
+    uint32_t vec[8];
+    __m256i d, tmp;
+    uint32_t good;
+    const __m256i bound = _mm256_set1_epi32(Q);
+
+    ctr = pos = 0;
+    while (ctr + 8 <= len && pos + 24 <= buflen) { /* vector path: 8 candidates (24 bytes) per iteration, always stores 8 lanes */
+        for (i = 0; i < 8; i++) {
+            vec[i] = buf[pos++];
+            vec[i] |= (uint32_t)buf[pos++] << 8;
+            vec[i] |= (uint32_t)buf[pos++] << 16;
+            vec[i] &= 0x7FFFFF; /* keep the low 23 bits */
+        }
+
+        d = _mm256_loadu_si256((__m256i_u *)vec);
+        tmp = _mm256_cmpgt_epi32(bound, d); /* all-ones in lanes where d < Q */
+        good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* one bit per accepted lane; cast intrinsic instead of a GCC-only vector cast */
+
+        __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); /* lane indices of the accepted candidates */
+        tmp = _mm256_cvtepu8_epi32(rid);
+        d = _mm256_permutevar8x32_epi32(d, tmp); /* move accepted lanes to the front */
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], d);
+        ctr += __builtin_popcount(good); /* only the accepted lanes count as output */
+    }
+
+    while (ctr < len && pos + 3 <= buflen) { /* scalar tail: one 3-byte candidate at a time */
+        vec[0] = buf[pos++];
+        vec[0] |= (uint32_t)buf[pos++] << 8;
+        vec[0] |= (uint32_t)buf[pos++] << 16;
+        vec[0] &= 0x7FFFFF;
+
+        if (vec[0] < Q) {
+            r[ctr++] = vec[0];
+        }
+    }
+
+    return ctr;
+}
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen) {
+    unsigned int i, ctr, pos;
+    uint8_t vec[32];
+    __m256i tmp0, tmp1;
+    __m128i d0, d1, rid;
+    uint32_t good;
+    const __m256i bound = _mm256_set1_epi8(2 * ETA + 1);
+    const __m256i off = _mm256_set1_epi32(Q + ETA);
+
+    ctr = pos = 0;
+    while (ctr + 32 <= len && pos + 16 <= buflen) { /* vector path: 32 nibbles (16 bytes) per iteration, handled as four groups of 8 */
+        for (i = 0; i < 16; i++) {
+            vec[2 * i + 0] = buf[pos] & 0x0F; /* low nibble */
+            vec[2 * i + 1] = buf[pos++] >> 4; /* high nibble */
+        }
+
+        tmp0 = _mm256_loadu_si256((__m256i_u *)vec);
+        tmp1 = _mm256_cmpgt_epi8(bound, tmp0); /* bytes with value <= 2*ETA */
+        good = _mm256_movemask_epi8(tmp1); /* one bit per accepted nibble */
+
+        d0 = _mm256_castsi256_si128(tmp0);
+        rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]);
+        d1 = _mm_shuffle_epi8(d0, rid);
+        tmp1 = _mm256_cvtepu8_epi32(d1);
+        tmp1 = _mm256_sub_epi32(off, tmp1); /* t -> Q + ETA - t */
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
+        ctr += __builtin_popcount(good & 0xFF);
+
+        d0 = _mm_bsrli_si128(d0, 8);
+        rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]);
+        d1 = _mm_shuffle_epi8(d0, rid);
+        tmp1 = _mm256_cvtepu8_epi32(d1);
+        tmp1 = _mm256_sub_epi32(off, tmp1);
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
+        ctr += __builtin_popcount((good >> 8) & 0xFF);
+
+        d0 = _mm256_extracti128_si256(tmp0, 1);
+        rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]);
+        d1 = _mm_shuffle_epi8(d0, rid);
+        tmp1 = _mm256_cvtepu8_epi32(d1);
+        tmp1 = _mm256_sub_epi32(off, tmp1);
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
+        ctr += __builtin_popcount((good >> 16) & 0xFF);
+
+        d0 = _mm_bsrli_si128(d0, 8);
+        rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]);
+        d1 = _mm_shuffle_epi8(d0, rid);
+        tmp1 = _mm256_cvtepu8_epi32(d1);
+        tmp1 = _mm256_sub_epi32(off, tmp1);
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1);
+        ctr += __builtin_popcount((good >> 24) & 0xFF);
+    }
+
+    while (ctr < len && pos < buflen) { /* scalar tail: two nibbles per byte */
+        vec[0] = buf[pos] & 0x0F;
+        vec[1] = buf[pos++] >> 4;
+
+        if (vec[0] <= 2 * ETA) {
+            r[ctr++] = Q + ETA - vec[0];
+        }
+        if (vec[1] <= 2 * ETA && ctr < len) {
+            r[ctr++] = Q + ETA - vec[1];
+        }
+    }
+
+    return ctr;
+}
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen) {
+    unsigned int i, ctr, pos;
+    uint32_t vec[8];
+    __m256i d, tmp;
+    uint32_t good;
+    const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1);
+    const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1);
+
+    ctr = pos = 0;
+    while (ctr + 8 <= len && pos + 20 <= buflen) { /* vector path: 8 candidates of 20 bits (20 bytes) per iteration */
+        for (i = 0; i < 4; i++) {
+            vec[2 * i + 0] = buf[pos + 0];
+            vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8;
+            vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16;
+            vec[2 * i + 0] &= 0xFFFFF; /* low 20 bits of the 5-byte group */
+
+            vec[2 * i + 1] = buf[pos + 2] >> 4; /* high 20 bits of the 5-byte group */
+            vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4;
+            vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12;
+
+            pos += 5;
+        }
+
+        d = _mm256_loadu_si256((__m256i_u *)vec);
+        tmp = _mm256_cmpgt_epi32(bound, d); /* lanes with d <= 2*GAMMA1 - 2 */
+        good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* one bit per accepted lane; cast intrinsic instead of a GCC-only vector cast */
+        d = _mm256_sub_epi32(off, d); /* t -> Q + GAMMA1 - 1 - t */
+
+        __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]);
+        tmp = _mm256_cvtepu8_epi32(rid);
+        d = _mm256_permutevar8x32_epi32(d, tmp);
+        _mm256_storeu_si256((__m256i_u *)&r[ctr], d);
+        ctr += __builtin_popcount(good);
+    }
+
+    while (ctr < len && pos + 5 <= buflen) { /* scalar tail: two candidates per 5 bytes */
+        vec[0] = buf[pos + 0];
+        vec[0] |= (uint32_t)buf[pos + 1] << 8;
+        vec[0] |= (uint32_t)buf[pos + 2] << 16;
+        vec[0] &= 0xFFFFF;
+
+        vec[1] = buf[pos + 2] >> 4;
+        vec[1] |= (uint32_t)buf[pos + 3] << 4;
+        vec[1] |= (uint32_t)buf[pos + 4] << 12;
+
+        pos += 5;
+
+        if (vec[0] <= 2 * GAMMA1 - 2) {
+            r[ctr++] = Q + GAMMA1 - 1 - vec[0];
+        }
+        if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) {
+            r[ctr++] = Q + GAMMA1 - 1 - vec[1];
+        }
+    }
+
+    return ctr;
+}
diff --git a/crypto_sign/dilithium3/avx2/rejsample.h b/crypto_sign/dilithium3/avx2/rejsample.h
new file mode 100644
index 00000000..6813c1ce
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/rejsample.h
@@ -0,0 +1,26 @@
+#ifndef REJSAMPLE_H
+#define REJSAMPLE_H
+
+#include <stdint.h>
+
+#include "poly.h"
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen);
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen);
+
+unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(
+    uint32_t *r,
+    unsigned int len,
+    const uint8_t *buf,
+    unsigned int buflen);
+
+#endif
diff --git a/crypto_sign/dilithium3/avx2/rounding.c b/crypto_sign/dilithium3/avx2/rounding.c
new file mode 100644
index 00000000..920f0f70
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/rounding.c
@@ -0,0 +1,115 @@
+#include "rounding.h"
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_power2round
+*
+* Description: For finite field element a, compute a0, a1 such that
+*              a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+*              Assumes a to be standard representative.
+*
+* Arguments:   - uint32_t a: input element
+*              - uint32_t *a0: pointer to output element Q + a0
+*
+* Returns a1.
+**************************************************/
+uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0) {
+    int32_t t;
+
+    /* Centralized remainder mod 2^D, computed without branches */
+    t = a & ((1U << D) - 1);
+    t -= (1U << (D - 1)) + 1;
+    t += (t >> 31) & (1U << D); /* t >> 31 is an all-ones mask when t is negative (relies on arithmetic right shift) */
+    t -= (1U << (D - 1)) - 1;
+    *a0 = Q + t; /* low part is stored offset by Q */
+    a = (a - t) >> D;
+    return a;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard
+*              representative.
+*
+* Arguments:   - uint32_t a: input element
+*              - uint32_t *a0: pointer to output element Q + a0
+*
+* Returns a1.
+**************************************************/
+uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0) {
+    int32_t t, u;
+
+    /* Centralized remainder mod ALPHA */
+    t = a & 0x7FFFF;
+    t += (a >> 19) << 9; /* presumably uses 2^19 = 2^9 (mod ALPHA), i.e. ALPHA = 2^19 - 2^9 - TODO confirm in params.h */
+    t -= ALPHA / 2 + 1;
+    t += (t >> 31) & ALPHA; /* add ALPHA back when t went negative (mask from arithmetic right shift) */
+    t -= ALPHA / 2 - 1;
+    a -= t;
+
+    /* Divide by ALPHA (possible to avoid) */
+    u = a - 1;
+    u >>= 31; /* all-ones only when a - 1 is negative as int32_t */
+    a = (a >> 19) + 1;
+    a -= u & 1;
+
+    /* Border case: bit 4 of a marks the wrap-around; it is subtracted from the low part and masked out of the result */
+    *a0 = Q + t - (a >> 4);
+    a &= 0xF;
+    return a;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits. Inputs assumed to be
+*              standard representatives.
+*
+* Arguments:   - uint32_t a0: low bits of input element
+*              - uint32_t a1: high bits of input element
+*
+* Returns 1 if high bits of a and b differ and 0 otherwise.
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(const uint32_t a0, const uint32_t a1) {
+    if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { /* no hint needed in these three cases */
+        return 0;
+    }
+
+    return 1;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_use_hint
+*
+* Description: Correct high bits according to hint.
+*
+* Arguments:   - uint32_t a: input element
+*              - unsigned int hint: hint bit
+*
+* Returns corrected high bits.
+**************************************************/
+uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(const uint32_t a, const unsigned int hint) {
+    uint32_t a0, a1;
+
+    a1 = PQCLEAN_DILITHIUM3_AVX2_decompose(a, &a0);
+    if (hint == 0) {
+        return a1;
+    }
+    if (a0 > Q) { /* a0 is stored offset by Q, so this tests for a positive low part */
+        return (a1 + 1) & 0xF;
+    }
+    return (a1 - 1) & 0xF; /* the & 0xF wraps the high bits modulo 16 */
+
+    /* If decompose does not divide out ALPHA:
+    if(hint == 0)
+      return a1;
+    else if(a0 > Q)
+      return (a1 + ALPHA) % (Q - 1);
+    else
+      return (a1 - ALPHA) % (Q - 1);
+    */
+}
diff --git a/crypto_sign/dilithium3/avx2/rounding.h b/crypto_sign/dilithium3/avx2/rounding.h
new file mode 100644
index 00000000..048e8aaa
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/rounding.h
@@ -0,0 +1,12 @@
+#ifndef ROUNDING_H
+#define ROUNDING_H
+
+#include "params.h"
+#include <stdint.h>
+
+uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0);
+uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0);
+unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(uint32_t a0, uint32_t a1);
+uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(uint32_t a, unsigned int hint);
+
+#endif
diff --git a/crypto_sign/dilithium3/avx2/shuffle.inc b/crypto_sign/dilithium3/avx2/shuffle.inc
new file mode 100644
index 00000000..df352030
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/shuffle.inc
@@ -0,0 +1,23 @@
+.macro shuffle8 r0,r1,r2,r3
+vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
+vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle4 r0,r1,r2,r3
+vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
+vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
+.endm
+
+.macro shuffle2 r0,r1,r2,r3
+vpsllq $32,%ymm\r1,%ymm12
+vpsrlq $32,%ymm\r0,%ymm13
+vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2
+vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3
+.endm
+
+.macro shuffle1 r0,r1,r2,r3
+vpslld $16,%ymm\r1,%ymm12
+vpsrld $16,%ymm\r0,%ymm13
+vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2
+vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3
+.endm
diff --git a/crypto_sign/dilithium3/avx2/sign.c b/crypto_sign/dilithium3/avx2/sign.c
new file mode 100644
index 00000000..0f0530d4
--- /dev/null
+++ b/crypto_sign/dilithium3/avx2/sign.c
@@ -0,0 +1,446 @@
+#include <stddef.h> /* NOTE(review): header name was lost in extraction; <stddef.h> assumed - confirm against upstream */
+#include <stdint.h>
+
+#include "fips202.h"
+#include "packing.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "symmetric.h"
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_expand_mat
+*
+* Description: Implementation of ExpandA. Generates matrix A with uniformly
+*              random coefficients a_{i,j} by performing rejection
+*              sampling on the output stream of SHAKE128(rho|i|j).
+*
+* Arguments:   - polyvecl mat[K]: output matrix
+*              - const uint8_t rho[]: byte array containing seed rho
+**************************************************/
+
+void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[5], const uint8_t rho[SEEDBYTES]) { /* hard-wired: fills 5 rows of 4 polynomials each */
+    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[0].vec[0],
+                                            &mat[0].vec[1],
+                                            &mat[0].vec[2],
+                                            &mat[0].vec[3],
+                                            rho, 0, 1, 2, 3); /* row 0: nonces (0 << 8) + column */
+    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[1].vec[0],
+                                            &mat[1].vec[1],
+                                            &mat[1].vec[2],
+                                            &mat[1].vec[3],
+                                            rho, 256, 257, 258, 259); /* row 1: nonces (1 << 8) + column */
+    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[2].vec[0],
+                                            &mat[2].vec[1],
+                                            &mat[2].vec[2],
+                                            &mat[2].vec[3],
+                                            rho, 512, 513, 514, 515); /* row 2: nonces (2 << 8) + column */
+    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[3].vec[0],
+                                            &mat[3].vec[1],
+                                            &mat[3].vec[2],
+                                            &mat[3].vec[3],
+                                            rho, 768, 769, 770, 771); /* row 3: nonces (3 << 8) + column */
+    PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[4].vec[0],
+                                            &mat[4].vec[1],
+                                            &mat[4].vec[2],
+                                            &mat[4].vec[3],
+                                            rho, 1024, 1025, 1026, 1027); /* row 4: nonces (4 << 8) + column */
+}
+
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3_AVX2_challenge
+*
+* Description: Implementation of H. Samples polynomial with 60 nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(mu|w1).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing mu +* - const polyveck *w1: pointer to vector w1 +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, + const uint8_t mu[CRHBYTES], + const polyveck *w1) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; + shake256ctx state; + + for (i = 0; i < CRHBYTES; ++i) { + inbuf[i] = mu[i]; + } + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); + } + + shake256_absorb(&state, inbuf, sizeof(inbuf)); + shake256_squeezeblocks(outbuf, 1, &state); + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t) outbuf[i] << 8 * i; + } + + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + for (i = 196; i < 256; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_squeezeblocks(outbuf, 1, &state); + pos = 0; + } + + b = outbuf[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1; + c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); + signs >>= 1; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; + uint16_t nonce = 0; + polyvecl mat[K]; + polyvecl s1, s1hat; + polyveck s2, t, t1, t0; + + /* Expand 32 bytes of randomness into rho, rhoprime and key */ + randombytes(seedbuf, 3 * SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Expand matrix */ + PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, + nonce, nonce + 1, nonce + 2, nonce + 3); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, + nonce + 4, nonce + 5, nonce + 6, nonce + 7); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&s2.vec[4], rhoprime, nonce + 8); + + /* Matrix-vector multiplication */ + s1hat = s1; + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1hat); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); + //PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&t.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&t.vec[i]); + } + + /* Add error vector s2 */ + PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&t, &t, &s2); + + /* Extract t1 and write public key */ + PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&t); + PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(&t1, &t0, &t); + PQCLEAN_DILITHIUM3_AVX2_pack_pk(pk, rho, &t1); + + /* Compute CRH(rho, t1) and write secret key */ + crh(tr, pk, 
PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); + PQCLEAN_DILITHIUM3_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES +* of len) +* - size_t *siglen: pointer to output length of signed message +* (should be PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + size_t i; + unsigned int n; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + poly c, chat; + polyvecl mat[K], s1, y, yhat, z; + polyveck t0, s2, w, w1, w0; + polyveck h, cs2, ct0; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + + + // use incremental hash API instead of copying around buffers + /* Compute CRH(tr, m) */ + shake256incctx state; + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y */ + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], 
&y.vec[1], &y.vec[2], &y.vec[3], + rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); + nonce += 4; + + /* Matrix-vector multiplication */ + yhat = y; + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&yhat); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&w.vec[i]); + } + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w); + PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &w0, &w); + PQCLEAN_DILITHIUM3_AVX2_challenge(&c, mu, &w1); + chat = c; + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat); + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&cs2.vec[i]); + } + PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&w0, &w0, &cs2); + PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&w0); + if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&z.vec[i]); + } + PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(&z); + if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&ct0.vec[i]); + } + + PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&ct0); + if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&w0, &w0, &ct0); + 
PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w0); + n = PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(&h, &w0, &w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature */ + PQCLEAN_DILITHIUM3_AVX2_pack_sig(sig, &z, &h, &c); + *siglen = PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - unsigned long long *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - unsigned long long mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, m, mlen); + rc = PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); + *smlen += mlen; + return rc; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *sig: signature +* - size_t siglen: length of signature (PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) +* - uint8_t *m: pointer to message +* - size_t *mlen: pointer to output length of message +* - uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk) { + size_t i; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + poly c, chat, cp; + polyvecl mat[K], z; + polyveck t1, w1, h, tmp1, tmp2; + + if (siglen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM3_AVX2_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM3_AVX2_unpack_sig(&z, &h, &c, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); + + shake256incctx state; + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&z); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); + } + + chat = c; + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat); + PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t1); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); + } + + PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); + PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(&tmp1); + 
PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(&tmp1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&tmp1); + PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(&w1, &tmp1, &h); + + /* Call random oracle and verify challenge */ + PQCLEAN_DILITHIUM3_AVX2_challenge(&cp, mu, &w1); + for (i = 0; i < N; ++i) { + if (c.coeffs[i] != cp.coeffs[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - unsigned char *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - unsigned long long *mlen: pointer to output length of message +* - const unsigned char *sm: pointer to signed message +* - unsigned long long smlen: length of signed message +* - const unsigned char *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { + goto badsig; + } + *mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; + + if (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + + /* Signature verification failed */ +badsig: + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium3/avx2/sign.h b/crypto_sign/dilithium3/avx2/sign.h new file mode 100644 index 00000000..15112b4d --- /dev/null +++ b/crypto_sign/dilithium3/avx2/sign.h @@ -0,0 +1,15 @@ +#ifndef SIGN_H 
+#define SIGN_H + +#include "api.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], + const polyveck *w1); + + +#endif + diff --git a/crypto_sign/dilithium3/avx2/stream.c b/crypto_sign/dilithium3/avx2/stream.c new file mode 100644 index 00000000..eaa4326b --- /dev/null +++ b/crypto_sign/dilithium3/avx2/stream.c @@ -0,0 +1,26 @@ +#include "stream.h" + +#include + +void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + + uint8_t buf[SEEDBYTES + 2]; + memcpy(buf, seed, SEEDBYTES); + buf[SEEDBYTES] = (uint8_t)nonce; + buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); + + shake128_absorb(state, buf, SEEDBYTES + 2); +} + + +void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + + uint8_t buf[CRHBYTES + 2]; + memcpy(buf, seed, CRHBYTES); + buf[CRHBYTES] = (uint8_t)nonce; + buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); + + shake256_absorb(state, buf, CRHBYTES + 2); +} diff --git a/crypto_sign/dilithium3/avx2/stream.h b/crypto_sign/dilithium3/avx2/stream.h new file mode 100644 index 00000000..93ce3d06 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/stream.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_STREAM_H +#define PQCLEAN_DILITHIUM3_AVX2_STREAM_H + +#include + +#include "fips202.h" +#include "params.h" + +void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#endif diff --git a/crypto_sign/dilithium3/avx2/symmetric.h b/crypto_sign/dilithium3/avx2/symmetric.h new file mode 100644 index 00000000..9a29e8e5 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/symmetric.h @@ -0,0 
+1,23 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H + +#include "params.h" +#include "stream.h" + + +#include "fips202.h" + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +typedef shake128ctx stream128_state; +typedef shake256ctx stream256_state; + + +#endif diff --git a/crypto_sign/dilithium3/clean/LICENSE b/crypto_sign/dilithium3/clean/LICENSE index 0299dbff..40541676 100644 --- a/crypto_sign/dilithium3/clean/LICENSE +++ b/crypto_sign/dilithium3/clean/LICENSE @@ -1,2 +1,6 @@ -Public Domain -Authors: Léo Ducas, Eike Kiltz, Tancrède Lepoint, Vadim Lyubashevsky, Gregor Seiler, Peter Schwabe, Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and the random number generator +we are using public-domain code from sources +and by authors listed in comments on top of +the respective files. 
diff --git a/crypto_sign/dilithium3/clean/Makefile b/crypto_sign/dilithium3/clean/Makefile index 452ef92e..cd309004 100644 --- a/crypto_sign/dilithium3/clean/Makefile +++ b/crypto_sign/dilithium3/clean/Makefile @@ -2,10 +2,10 @@ LIB=libdilithium3_clean.a -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c symmetric.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o symmetric.o +SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c +OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h + reduce.h rounding.h symmetric.h stream.h CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake index 7bd9fd50..f41af919 100644 --- a/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake +++ b/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libdilithium3_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj symmetric.obj +OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX all: $(LIBRARY) diff --git a/crypto_sign/dilithium3/clean/api.h b/crypto_sign/dilithium3/clean/api.h index 411160d8..18def166 100644 --- a/crypto_sign/dilithium3/clean/api.h +++ b/crypto_sign/dilithium3/clean/api.h @@ -4,14 +4,25 @@ #include #include - #define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1472U #define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 3504U #define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 2701U #define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3" -int 
PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair( + uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *msg, size_t len, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -21,13 +32,6 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium3/clean/ntt.c b/crypto_sign/dilithium3/clean/ntt.c index 95bdd46c..2a45f808 100644 --- a/crypto_sign/dilithium3/clean/ntt.c +++ b/crypto_sign/dilithium3/clean/ntt.c @@ -1,11 +1,12 @@ +#include + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" -#include -/* Roots of unity in order needed by forward ntt */ -static const uint32_t zetas[N] = { +/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM3_CLEAN_ntt */ +static const uint32_t PQCLEAN_DILITHIUM3_CLEAN_zetas[N] = { 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, @@ -40,8 +41,8 @@ static const uint32_t zetas[N] = { 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 }; -/* Roots of unity in order needed by inverse ntt */ -static const uint32_t zetas_inv[N] = { +/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM3_CLEAN_ntt */ +static const uint32_t 
PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[N] = { 6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, @@ -77,7 +78,7 @@ static const uint32_t zetas_inv[N] = { }; /************************************************* -* Name: ntt +* Name: PQCLEAN_DILITHIUM3_CLEAN_ntt * * Description: Forward NTT, in-place. No modular reduction is performed after * additions or subtractions. Hence output coefficients can be up @@ -86,16 +87,16 @@ static const uint32_t zetas_inv[N] = { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) { +void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t *p) { unsigned int len, start, j, k; uint32_t zeta, t; k = 1; for (len = 128; len > 0; len >>= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas[k++]; + zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); p[j + len] = p[j] + 2 * Q - t; p[j] = p[j] + t; } @@ -104,7 +105,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) { } /************************************************* -* Name: invntt_frominvmont +* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont * * Description: Inverse NTT and multiplication by Montgomery factor 2^32. * In-place. 
No modular reductions after additions or @@ -113,7 +114,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]) { +void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t *p) { unsigned int start, len, j, k; uint32_t t, zeta; const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; @@ -121,17 +122,17 @@ void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]) { k = 0; for (len = 1; len < N; len <<= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas_inv[k++]; + zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { t = p[j]; p[j] = t + p[j + len]; p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + p[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); } } } for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)f * p[j]); + p[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) f * p[j]); } } diff --git a/crypto_sign/dilithium3/clean/ntt.h b/crypto_sign/dilithium3/clean/ntt.h index 8ab80e45..e0176177 100644 --- a/crypto_sign/dilithium3/clean/ntt.h +++ b/crypto_sign/dilithium3/clean/ntt.h @@ -1,8 +1,9 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM3_CLEAN_NTT_H + +#include #include "params.h" -#include void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]); void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]); diff --git a/crypto_sign/dilithium3/clean/packing.c b/crypto_sign/dilithium3/clean/packing.c index f52bb072..60d4952c 100644 --- a/crypto_sign/dilithium3/clean/packing.c +++ b/crypto_sign/dilithium3/clean/packing.c @@ -4,17 +4,18 @@ #include "polyvec.h" 
/************************************************* -* Name: pack_pk +* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_pk * * Description: Bit-pack public key pk = (rho, t1). * -* Arguments: - unsigned char pk[]: output byte array -* - const unsigned char rho[]: byte array containing rho +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], - const polyveck *t1) { +void PQCLEAN_DILITHIUM3_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -28,17 +29,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], } /************************************************* -* Name: unpack_pk +* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_pk * * Description: Unpack public key pk = (rho, t1). * -* Arguments: - const unsigned char rho[]: output byte array for rho +* Arguments: - const uint8_t rho[]: output byte array for rho * - const polyveck *t1: pointer to output vector t1 -* - unsigned char pk[]: byte array containing bit-packed pk +* - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], - polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) { +void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -52,25 +54,26 @@ void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sk +* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sk * * Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - unsigned char sk[]: output byte array -* - const unsigned char rho[]: byte array containing rho -* - const unsigned char key[]: byte array containing key -* - const unsigned char tr[]: byte array containing tr +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 * - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { +void PQCLEAN_DILITHIUM3_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -104,25 +107,26 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], } /************************************************* -* Name: unpack_sk +* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sk * * Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - const unsigned char rho[]: output byte array for rho -* - const unsigned char key[]: output byte array for key -* - const unsigned char tr[]: output byte array for tr +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t key[]: output byte array for key +* - const uint8_t tr[]: output byte array for tr * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 * - const polyveck *r0: pointer to output vector t0 -* - unsigned char sk[]: byte array containing bit-packed sk +* - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]) { +void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -156,19 +160,20 @@ void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sig +* Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sig * * Description: Bit-pack signature sig = (z, h, c). 
* -* Arguments: - unsigned char sig[]: output byte array +* Arguments: - uint8_t sig[]: output byte array * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial +* - const poly *c: pointer to PQCLEAN_DILITHIUM3_CLEAN_challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { +void PQCLEAN_DILITHIUM3_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { unsigned int i, j, k; uint64_t signs, mask; @@ -182,10 +187,11 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (unsigned char) j; + sig[k++] = (uint8_t)j; } } - sig[OMEGA + i] = (unsigned char) k; + + sig[OMEGA + i] = (uint8_t)k; } while (k < OMEGA) { sig[k++] = 0; @@ -199,7 +205,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], sig[i] = 0; for (j = 0; j < 8; ++j) { if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (unsigned char) (1U << j); + sig[i] |= (uint8_t)(1u << j); if (c->coeffs[8 * i + j] == (Q - 1)) { signs |= mask; } @@ -209,27 +215,28 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], } sig += N / 8; for (i = 0; i < 8; ++i) { - sig[i] = (unsigned char) (signs >> 8 * i); + sig[i] = (uint8_t)(signs >> 8u * i); } } /************************************************* -* Name: unpack_sig +* Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sig * * Description: Unpack signature sig = (z, h, c). 
* * Arguments: - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial -* - const unsigned char sig[]: byte array containing +* - poly *c: pointer to output PQCLEAN_DILITHIUM3_CLEAN_challenge polynomial +* - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z, - polyveck *h, - poly *c, - const unsigned char sig[CRYPTO_BYTES]) { +int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { unsigned int i, j, k; uint64_t signs; @@ -266,6 +273,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z, return 1; } } + sig += OMEGA + K; /* Decode c */ diff --git a/crypto_sign/dilithium3/clean/packing.h b/crypto_sign/dilithium3/clean/packing.h index ef1634c0..298dac16 100644 --- a/crypto_sign/dilithium3/clean/packing.h +++ b/crypto_sign/dilithium3/clean/packing.h @@ -1,31 +1,36 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_PACKING_H +#define PQCLEAN_DILITHIUM3_CLEAN_PACKING_H #include "params.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM3_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM3_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const 
uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0); +void PQCLEAN_DILITHIUM3_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); -void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]); -int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(polyvecl *z, polyveck *h, poly *c, - const unsigned char sig[CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); #endif diff --git a/crypto_sign/dilithium3/clean/params.h b/crypto_sign/dilithium3/clean/params.h index edae8066..8632ac0f 100644 --- a/crypto_sign/dilithium3/clean/params.h +++ b/crypto_sign/dilithium3/clean/params.h @@ -1,18 +1,17 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 #define QBITS 23 -#define ROOT_OF_UNITY 1753 #define D 14 #define GAMMA1 ((Q - 1)/16) #define GAMMA2 (GAMMA1/2) #define ALPHA (2*GAMMA2) -// DilithiumIII parameters #define K 5 #define L 4 #define ETA 5 @@ -20,6 +19,7 @@ #define BETA 275 #define OMEGA 96 + #define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) #define POLT0_SIZE_PACKED ((N*D)/8) #define POLETA_SIZE_PACKED ((N*SETABITS)/8) diff --git a/crypto_sign/dilithium3/clean/poly.c b/crypto_sign/dilithium3/clean/poly.c index 2b9ff9a1..0d363285 100644 --- 
a/crypto_sign/dilithium3/clean/poly.c +++ b/crypto_sign/dilithium3/clean/poly.c @@ -1,10 +1,11 @@ +#include <stdint.h> + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include <stdint.h> /************************************************* @@ -16,8 +17,7 @@ * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_reduce32(a->coeffs[i]); } } @@ -31,8 +31,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_csubq(a->coeffs[i]); } } @@ -46,8 +45,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_freeze(a->coeffs[i]); } } @@ -61,9 +59,8 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) { * - const poly *a: pointer to first summand * - const poly *b: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } @@ -78,11 +75,10 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { * Arguments:
- poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial to be -* subtraced from first input polynomial +* subtracted from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; } } @@ -96,8 +92,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] <<= D; } } @@ -139,9 +134,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(poly *a) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); } @@ -160,12 +153,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly * * - const poly *v: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -182,12 +172,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a * - const poly *c: pointer to input 
polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -204,13 +191,11 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) * Returns number of 1 bits. **************************************************/ unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - unsigned int i, s = 0; - - for (i = 0; i < N; ++i) { + unsigned int s = 0; + for (size_t i = 0; i < N; ++i) { h->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); s += h->coeffs[i]; } - return s; } @@ -224,12 +209,9 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, co * - const poly *h: pointer to input hint polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); } - } /************************************************* @@ -244,15 +226,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * * Returns 0 if norm is strictly smaller than B and 1 otherwise. **************************************************/ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { - unsigned int i; int32_t t; - /* It is ok to leak which coefficient violates the bound since the probability for each coefficient is independent of secret data but we must not leak the sign of the centralized representative. 
*/ - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { /* Absolute value of centralized representative */ - t = (int32_t) ((Q - 1) / 2 - a->coeffs[i]); + t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); t ^= (t >> 31); t = (Q - 1) / 2 - t; @@ -260,7 +240,6 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { return 1; } } - return 0; } @@ -272,7 +251,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -280,8 +259,8 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { **************************************************/ static unsigned int rej_uniform(uint32_t *a, unsigned int len, - const unsigned char *buf, - unsigned int buflen) { + const uint8_t *buf, + size_t buflen) { unsigned int ctr, pos; uint32_t t; @@ -308,19 +287,20 @@ static unsigned int rej_uniform(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES)/STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t seed[SEEDBYTES], uint16_t nonce) { - unsigned int i, ctr, off; - unsigned int buflen = POLY_UNIFORM_BUFLEN; - unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; - shake128ctx state; + unsigned int i, ctr; + size_t buflen = POLY_UNIFORM_BUFLEN; + uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; + stream128_state state; + size_t off; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); @@ -347,7 +327,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -355,7 +335,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, **************************************************/ static unsigned int rej_eta(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -384,19 +364,18 @@ static unsigned int rej_eta(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N/2 * (1U << SETABITS)) / (2*ETA + 1)\ - + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) #define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce) { unsigned int ctr; - unsigned char buf[POLY_UNIFORM_ETA_BUFLEN]; - shake128ctx state; + uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; + stream128_state state; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); @@ -418,7 +397,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. 
Can be smaller than len if not enough @@ -426,7 +405,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, **************************************************/ static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -438,7 +417,7 @@ static unsigned int rej_gamma1m1(uint32_t *a, t0 |= (uint32_t)buf[pos + 2] << 16; t0 &= 0xFFFFF; - t1 = buf[pos + 2] >> 4; + t1 = buf[pos + 2] >> 4; t1 |= (uint32_t)buf[pos + 3] << 4; t1 |= (uint32_t)buf[pos + 4] << 12; @@ -463,19 +442,19 @@ static unsigned int rej_gamma1m1(uint32_t *a, * sampling on output stream of SHAKE256(seed|nonce). * * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * CRHBYTES * - uint16_t nonce: 16-bit nonce **************************************************/ #define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) #define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce) { unsigned int i, ctr, off; unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - shake256ctx state; + uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + stream256_state state; stream256_init(&state, seed, nonce); stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); @@ -500,18 +479,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a, * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. * Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLETA_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { unsigned int i; - unsigned char t[8]; + uint8_t t[8]; for (i = 0; i < N / 2; ++i) { - t[0] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 0]); - t[1] = (uint8_t) (Q + ETA - a->coeffs[2 * i + 1]); - r[i] = (uint8_t) (t[0] | (t[1] << 4)); + t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]); + t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]); + r[i] = (uint8_t)(t[0] | (t[1] << 4)); } } @@ -522,68 +501,67 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { * Output coefficients lie in [Q-ETA,Q+ETA]. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) { +void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; } + } /************************************************* -* Name: polyt1_pack +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack * * Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { unsigned int i; for (i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t) ((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t) ((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t) ((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t) ((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t) ((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t) ((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t) ((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t) ((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t) ((a->coeffs[8 * i + 7] >> 1)); + r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); + r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); + r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); + r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); + r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); + r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); + r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); + r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); + r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); } } /************************************************* -* Name: 
polyt1_unpack +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack * * Description: Unpack polynomial t1 with 9-bit coefficients. * Output coefficients are standard representatives. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; +void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; + r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; + r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; + r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; + r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; + r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; + r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; + r->coeffs[8 * i + 7] 
= ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; } - } /************************************************* @@ -592,32 +570,30 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. * Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT0_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { uint32_t t[4]; - for (i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + for (size_t i = 0; i < N / 4; ++i) { + t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; + t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; + t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; + t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; - r[7 * i + 0] = (uint8_t) (t[0]); - r[7 * i + 1] = (uint8_t) (t[0] >> 8); - r[7 * i + 1] |= (uint8_t) (t[1] << 6); - r[7 * i + 2] = (uint8_t) (t[1] >> 2); - r[7 * i + 3] = (uint8_t) (t[1] >> 10); - r[7 * i + 3] |= (uint8_t) (t[2] << 4); - r[7 * i + 4] = (uint8_t) (t[2] >> 4); - r[7 * i + 5] = (uint8_t) (t[2] >> 12); - r[7 * i + 5] |= (uint8_t) (t[3] << 2); - r[7 * i + 6] = (uint8_t) (t[3] >> 6); + r[7 * i + 0] = (uint8_t)(t[0]); + r[7 * i + 1] = (uint8_t)(t[0] >> 8); + r[7 * i + 1] |= (uint8_t)(t[1] << 6); + r[7 * i + 2] = (uint8_t)(t[1] >> 2); + r[7 * i + 3] = (uint8_t)(t[1] >> 10); + r[7 * i + 3] |= (uint8_t)(t[2] << 4); + r[7 * i + 4] = (uint8_t)(t[2] >> 4); + 
r[7 * i + 5] = (uint8_t)(t[2] >> 12); + r[7 * i + 5] |= (uint8_t)(t[3] << 2); + r[7 * i + 6] = (uint8_t)(t[3] >> 6); } - } /************************************************* @@ -627,32 +603,30 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { * Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { - unsigned int i; +void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { - for (i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; + for (size_t i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[7 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; + r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; + r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; + r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; + r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 
1]; r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; } - } /************************************************* @@ -662,29 +636,27 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { * in [-(GAMMA1 - 1), GAMMA1 - 1]. * Input coefficients are assumed to be standard representatives. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLZ_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a) { uint32_t t[2]; - for (i = 0; i < N / 2; ++i) { + for (size_t i = 0; i < N / 2; ++i) { /* Map to {0,...,2*GAMMA1 - 2} */ t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; t[0] += ((int32_t)t[0] >> 31) & Q; t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; t[1] += ((int32_t)t[1] >> 31) & Q; - r[5 * i + 0] = (uint8_t) (t[0]); - r[5 * i + 1] = (uint8_t) (t[0] >> 8); - r[5 * i + 2] = (uint8_t) (t[0] >> 16); - r[5 * i + 2] |= (uint8_t) (t[1] << 4); - r[5 * i + 3] = (uint8_t) (t[1] >> 4); - r[5 * i + 4] = (uint8_t) (t[1] >> 12); + r[5 * i + 0] = (uint8_t)t[0]; + r[5 * i + 1] = (uint8_t)(t[0] >> 8); + r[5 * i + 2] = (uint8_t)(t[0] >> 16); + r[5 * i + 2] |= (uint8_t)(t[1] << 4); + r[5 * i + 3] = (uint8_t)(t[1] >> 4); + r[5 * i + 4] = (uint8_t)(t[1] >> 12); } - } /************************************************* @@ -695,26 +667,23 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a) { * Output coefficients are standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; + r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; } - } /************************************************* @@ -723,15 +692,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial w1 with coefficients in [0, 15]. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLW1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(unsigned char *r, const poly *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { - r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); +void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + for (size_t i = 0; i < N / 2; ++i) { + r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); } } diff --git a/crypto_sign/dilithium3/clean/poly.h b/crypto_sign/dilithium3/clean/poly.h index ce1aa8ce..593c0524 100644 --- a/crypto_sign/dilithium3/clean/poly.h +++ b/crypto_sign/dilithium3/clean/poly.h @@ -1,8 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM3_CLEAN_POLY_H + +#include <stdint.h> #include "params.h" -#include <stdint.h> typedef struct { uint32_t coeffs[N]; @@ -27,27 +28,27 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B); void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce); -void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); -void
PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(unsigned char *r, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a); #endif diff --git a/crypto_sign/dilithium3/clean/polyvec.c b/crypto_sign/dilithium3/clean/polyvec.c index 6850032e..00aba30b 100644 --- a/crypto_sign/dilithium3/clean/polyvec.c +++ b/crypto_sign/dilithium3/clean/polyvec.c @@ -1,14 +1,15 @@ +#include <stdint.h> + #include "params.h" #include "poly.h" #include "polyvec.h" -#include <stdint.h> /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ /************************************************* -* Name: polyvecl_freeze +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze * * Description: Reduce coefficients of polynomials in vector of length L * to standard representatives.
@@ -24,7 +25,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) { } /************************************************* -* Name: polyvecl_add +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add * * Description: Add vectors of polynomials of length L. * No modular reduction is performed. @@ -42,7 +43,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const } /************************************************* -* Name: polyvecl_ntt +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt * * Description: Forward NTT of all polynomials in vector of length L. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -58,7 +59,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v) { } /************************************************* -* Name: polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials @@ -85,7 +86,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, } /************************************************* -* Name: polyvecl_chknorm +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. * Assumes input coefficients to be standard representatives. @@ -96,14 +97,15 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, * Returns 0 if norm of all polynomials is strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { +int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { unsigned int i; for (i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } @@ -113,7 +115,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) /************************************************* -* Name: polyveck_reduce +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K * to representatives in [0,2*Q[. @@ -129,7 +131,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v) { } /************************************************* -* Name: polyveck_csubq +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq * * Description: For all coefficients of polynomials in vector of length K * subtract Q if coefficient is bigger than Q. @@ -145,7 +147,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(polyveck *v) { } /************************************************* -* Name: polyveck_freeze +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze * * Description: Reduce coefficients of polynomials in vector of length K * to standard representatives. @@ -161,7 +163,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) { } /************************************************* -* Name: polyveck_add +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_add * * Description: Add vectors of polynomials of length K. * No modular reduction is performed. @@ -179,7 +181,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_sub +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub * * Description: Subtract vectors of polynomials of length K. 
* Assumes coefficients of polynomials in second input vector @@ -199,7 +201,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_shiftl +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular * reduction. Assumes input coefficients to be less than 2^{32-D}. @@ -215,7 +217,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) { } /************************************************* -* Name: polyveck_ntt +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt * * Description: Forward NTT of all polynomials in vector of length K. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -231,7 +233,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) { } /************************************************* -* Name: polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. Input coefficients need to be less @@ -248,7 +250,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v) { } /************************************************* -* Name: polyveck_chknorm +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. * Assumes input coefficients to be standard representatives. @@ -259,19 +261,20 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v) { * Returns 0 if norm of all polynomials are strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t bound) { +int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { unsigned int i; for (i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } /************************************************* -* Name: polyveck_power2round +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, * compute a0, a1 such that a mod Q = a1*2^D + a0 @@ -293,7 +296,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, c } /************************************************* -* Name: polyveck_decompose +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, * compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 @@ -316,7 +319,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con } /************************************************* -* Name: polyveck_make_hint +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint * * Description: Compute hint vector. * @@ -339,19 +342,19 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, } /************************************************* -* Name: polyveck_use_hint +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint * * Description: Use hint vector to correct the high bits of input vector. 
* * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *u: pointer to input vector +* - const polyveck *v: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { unsigned int i; for (i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); } } diff --git a/crypto_sign/dilithium3/clean/polyvec.h b/crypto_sign/dilithium3/clean/polyvec.h index 73b8341f..55902065 100644 --- a/crypto_sign/dilithium3/clean/polyvec.h +++ b/crypto_sign/dilithium3/clean/polyvec.h @@ -1,9 +1,10 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H + +#include #include "params.h" #include "poly.h" -#include /* Vectors of polynomials of length L */ typedef struct { @@ -46,6 +47,6 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, const polyveck *v0, const polyveck *v1); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); #endif diff --git a/crypto_sign/dilithium3/clean/reduce.c b/crypto_sign/dilithium3/clean/reduce.c index 651d0a2b..02da8968 100644 --- a/crypto_sign/dilithium3/clean/reduce.c +++ b/crypto_sign/dilithium3/clean/reduce.c @@ -1,9 +1,10 @@ -#include "params.h" -#include "reduce.h" #include +#include "params.h" +#include "reduce.h" + /************************************************* -* Name: montgomery_reduce 
+* Name: PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce * * Description: For finite field element a with 0 <= a <= Q*2^32, * compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. @@ -20,11 +21,11 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(uint64_t a) { t *= Q; t = a + t; t >>= 32; - return (uint32_t) t; + return (uint32_t)t; } /************************************************* -* Name: reduce32 +* Name: PQCLEAN_DILITHIUM3_CLEAN_reduce32 * * Description: For finite field element a, compute r \equiv a (mod Q) * such that 0 <= r < 2*Q. @@ -43,7 +44,7 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(uint32_t a) { } /************************************************* -* Name: csubq +* Name: PQCLEAN_DILITHIUM3_CLEAN_csubq * * Description: Subtract Q if input coefficient is bigger than Q. * @@ -58,7 +59,7 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_csubq(uint32_t a) { } /************************************************* -* Name: freeze +* Name: PQCLEAN_DILITHIUM3_CLEAN_freeze * * Description: For finite field element a, compute standard * representative r = a mod Q. diff --git a/crypto_sign/dilithium3/clean/reduce.h b/crypto_sign/dilithium3/clean/reduce.h index 4c6e8578..ba94792e 100644 --- a/crypto_sign/dilithium3/clean/reduce.h +++ b/crypto_sign/dilithium3/clean/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H #include diff --git a/crypto_sign/dilithium3/clean/rounding.c b/crypto_sign/dilithium3/clean/rounding.c index 603a7d59..6dcb8101 100644 --- a/crypto_sign/dilithium3/clean/rounding.c +++ b/crypto_sign/dilithium3/clean/rounding.c @@ -1,7 +1,10 @@ +#include + #include "params.h" #include "rounding.h" + /************************************************* -* Name: power2round +* Name: PQCLEAN_DILITHIUM3_CLEAN_power2round * * Description: For finite field element a, compute a0, a1 such that * a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. 
@@ -17,16 +20,16 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(uint32_t a, uint32_t *a0) { /* Centralized remainder mod 2^D */ t = a & ((1U << D) - 1); - t -= ((1U << (D - 1)) + 1); - t += ((uint32_t)((int32_t)t >> 31) & (1U << D)); - t -= ((1U << (D - 1)) - 1); - *a0 = (Q + t); + t -= (1U << (D - 1)) + 1; + t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); + t -= (1U << (D - 1)) - 1; + *a0 = Q + t; a = (a - t) >> D; return a; } /************************************************* -* Name: decompose +* Name: PQCLEAN_DILITHIUM3_CLEAN_decompose * * Description: For finite field element a, compute high and low bits a0, a1 such * that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except @@ -41,28 +44,29 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(uint32_t a, uint32_t *a0) { **************************************************/ uint32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(uint32_t a, uint32_t *a0) { int32_t t, u; + /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (int32_t) ((a >> 19) << 9); + t = a & 0x7FFFFu; + t += (int32_t)((a >> 19u) << 9u); t -= ALPHA / 2 + 1; t += (t >> 31) & ALPHA; t -= ALPHA / 2 - 1; - a -= (uint32_t) t; + a -= (uint32_t)t; /* Divide by ALPHA (possible to avoid) */ - u = (int32_t) a - 1; + u = (int32_t)(a - 1); u >>= 31; a = (a >> 19) + 1; a -= u & 1; /* Border case */ - *a0 = Q + (uint32_t)t - (a >> 4); - a &= 0xF; + *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); + a &= 0xFu; return a; } /************************************************* -* Name: make_hint +* Name: PQCLEAN_DILITHIUM3_CLEAN_make_hint * * Description: Compute hint bit indicating whether the low bits of the * input element overflow into the high bits. Inputs assumed to be @@ -73,7 +77,7 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(uint32_t a, uint32_t *a0) { * * Returns 1 if high bits of a and b differ and 0 otherwise. 
**************************************************/ -unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1) { +unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(const uint32_t a0, const uint32_t a1) { if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { return 0; } @@ -82,7 +86,7 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1) { } /************************************************* -* Name: use_hint +* Name: PQCLEAN_DILITHIUM3_CLEAN_use_hint * * Description: Correct high bits according to hint. * @@ -91,7 +95,7 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1) { * * Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(uint32_t a, unsigned int hint) { +uint32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(const uint32_t a, const unsigned int hint) { uint32_t a0, a1; a1 = PQCLEAN_DILITHIUM3_CLEAN_decompose(a, &a0); @@ -101,5 +105,15 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(uint32_t a, unsigned int hint) { if (a0 > Q) { return (a1 + 1) & 0xF; } + return (a1 - 1) & 0xF; + + /* If PQCLEAN_DILITHIUM3_CLEAN_decompose does not divide out ALPHA: + if(hint == 0) + return a1; + else if(a0 > Q) + return (a1 + ALPHA) % (Q - 1); + else + return (a1 - ALPHA) % (Q - 1); + */ } diff --git a/crypto_sign/dilithium3/clean/rounding.h b/crypto_sign/dilithium3/clean/rounding.h index ad979f09..acb2fbdd 100644 --- a/crypto_sign/dilithium3/clean/rounding.h +++ b/crypto_sign/dilithium3/clean/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H #include diff --git a/crypto_sign/dilithium3/clean/sign.c b/crypto_sign/dilithium3/clean/sign.c index 8818a8ab..3438b308 100644 --- a/crypto_sign/dilithium3/clean/sign.c +++ b/crypto_sign/dilithium3/clean/sign.c @@ -1,3 +1,6 @@ +#include +#include + #include "fips202.h" #include "packing.h" 
#include "params.h" @@ -7,19 +10,17 @@ #include "sign.h" #include "symmetric.h" -#include - /************************************************* -* Name: expand_mat +* Name: PQCLEAN_DILITHIUM3_CLEAN_expand_mat * * Description: Implementation of ExpandA. Generates matrix A with uniformly * random coefficients a_{i,j} by performing rejection * sampling on the output stream of SHAKE128(rho|i|j). * * Arguments: - polyvecl mat[K]: output matrix -* - const unsigned char rho[]: byte array containing seed rho +* - const uint8_t rho[]: byte array containing seed rho **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) { +void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { unsigned int i, j; for (i = 0; i < K; ++i) { @@ -30,23 +31,23 @@ void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rh } /************************************************* -* Name: challenge +* Name: PQCLEAN_DILITHIUM3_CLEAN_challenge * * Description: Implementation of H. Samples polynomial with 60 nonzero * coefficients in {-1,1} using the output stream of * SHAKE256(mu|w1). 
* * Arguments: - poly *c: pointer to output polynomial -* - const unsigned char mu[]: byte array containing mu +* - const uint8_t mu[]: byte array containing mu * - const polyveck *w1: pointer to vector w1 **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, - const unsigned char mu[CRHBYTES], + const uint8_t mu[CRHBYTES], const polyveck *w1) { unsigned int i, b, pos; uint64_t signs; - unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - unsigned char outbuf[SHAKE256_RATE]; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; shake256ctx state; for (i = 0; i < CRHBYTES; ++i) { @@ -88,22 +89,22 @@ void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, } /************************************************* -* Name: crypto_sign_keypair +* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair * * Description: Generates public and private key. * -* Arguments: - unsigned char *pk: pointer to output public key (allocated -* array of CRYPTO_PUBLICKEYBYTES bytes) -* - unsigned char *sk: pointer to output private key (allocated -* array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { unsigned int i; - unsigned char seedbuf[3 * SEEDBYTES]; - unsigned char tr[CRHBYTES]; - const unsigned char *rho, *rhoprime, *key; + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; uint16_t nonce = 0; polyvecl mat[K]; polyvecl s1, s1hat; @@ -144,19 +145,35 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { PQCLEAN_DILITHIUM3_CLEAN_pack_pk(pk, rho, &t1); /* 
Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, CRYPTO_PUBLICKEYBYTES); + crh(tr, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); PQCLEAN_DILITHIUM3_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES +* of len) +* - size_t *smlen: pointer to output length of signed message +* (should be PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk) { + const uint8_t *msg, size_t mlen, + const uint8_t *sk) { unsigned long long i; unsigned int n; - unsigned char seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; - unsigned char *rho, *tr, *key, *mu, *rhoprime; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; uint16_t nonce = 0; poly c, chat; polyvecl mat[K], s1, y, yhat, z; @@ -170,13 +187,12 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( rhoprime = mu + CRHBYTES; PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); - // use incremental hash API instead of copying around buffers /* Compute CRH(tr, msg) */ shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); + shake256_inc_absorb(&state, msg, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); @@ -253,11 +269,51 @@ rej: /* Write signature */ PQCLEAN_DILITHIUM3_CLEAN_pack_sig(sig, &z, &h, &c); - - *siglen = CRYPTO_BYTES; + *siglen = 
PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - unsigned long long *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - unsigned long long mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, m, mlen); + rc = PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); + *smlen += mlen; + return rc; +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *sig: signature +* - size_t siglen: length of signature (PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) +* - uint8_t *m: pointer to message +* - size_t *mlen: pointer to output length of message +* - uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { @@ -268,7 +324,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( polyvecl mat[K], z; polyveck t1, w1, h, tmp1, tmp2; - if (siglen < CRYPTO_BYTES) { + if (siglen < PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { return -1; } @@ -281,7 +337,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( } /* Compute CRH(CRH(rho, t1), msg) */ - crh(mu, pk, CRYPTO_PUBLICKEYBYTES); + crh(mu, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); shake256incctx state; shake256_inc_init(&state); @@ -325,40 +381,9 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( // All good return 0; } -/************************************************* -* Name: crypto_sign -* -* Description: Compute signed message. 
-* -* Arguments: - unsigned char *sm: pointer to output signed message (allocated -* array with CRYPTO_BYTES + mlen bytes), -* can be equal to m -* - unsigned long long *smlen: pointer to output length of signed -* message -* - const unsigned char *m: pointer to message to be signed -* - unsigned long long mlen: length of message -* - const unsigned char *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, - size_t *smlen, - const uint8_t *m, - size_t mlen, - const uint8_t *sk) { - size_t i; - int rc; - for (i = 0; i < mlen; i++) { - sm[CRYPTO_BYTES + i] = m[i]; - } - rc = PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; - -} /************************************************* -* Name: crypto_sign_open +* Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open * * Description: Verify signed message. * @@ -371,24 +396,23 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, - size_t *mlen, - const uint8_t *sm, - size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; + *mlen = smlen - PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, CRYPTO_BYTES, - sm + CRYPTO_BYTES, *mlen, pk)) { + if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] 
= sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium3/clean/sign.h b/crypto_sign/dilithium3/clean/sign.h index 4c309571..ae80256a 100644 --- a/crypto_sign/dilithium3/clean/sign.h +++ b/crypto_sign/dilithium3/clean/sign.h @@ -1,30 +1,12 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM3_CLEAN_SIGN_H +#include "api.h" #include "params.h" #include "poly.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const unsigned char mu[CRHBYTES], +void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], const polyveck *w1); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); - #endif diff --git a/crypto_sign/dilithium3/clean/stream.c b/crypto_sign/dilithium3/clean/stream.c new file mode 100644 index 00000000..a1ac2ff0 --- /dev/null +++ b/crypto_sign/dilithium3/clean/stream.c @@ -0,0 +1,26 @@ +#include "stream.h" + +#include + +void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + + uint8_t buf[SEEDBYTES + 2]; + memcpy(buf, seed, SEEDBYTES); + buf[SEEDBYTES] = (uint8_t)nonce; + buf[SEEDBYTES + 1] = 
(uint8_t)(nonce >> 8); + + shake128_absorb(state, buf, SEEDBYTES + 2); +} + + +void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + + uint8_t buf[CRHBYTES + 2]; + memcpy(buf, seed, CRHBYTES); + buf[CRHBYTES] = (uint8_t)nonce; + buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); + + shake256_absorb(state, buf, CRHBYTES + 2); +} diff --git a/crypto_sign/dilithium3/clean/stream.h b/crypto_sign/dilithium3/clean/stream.h new file mode 100644 index 00000000..711b266f --- /dev/null +++ b/crypto_sign/dilithium3/clean/stream.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_DILITHIUM3_CLEAN_STREAM_H +#define PQCLEAN_DILITHIUM3_CLEAN_STREAM_H + +#include + +#include "fips202.h" +#include "params.h" + +void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#endif diff --git a/crypto_sign/dilithium3/clean/symmetric.c b/crypto_sign/dilithium3/clean/symmetric.c deleted file mode 100644 index 955e686d..00000000 --- a/crypto_sign/dilithium3/clean/symmetric.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "symmetric.h" -#include "fips202.h" - -void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char seed[SEEDBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[SEEDBYTES + 2]; - - for (i = 0; i < SEEDBYTES; ++i) { - buf[i] = seed[i]; - } - buf[SEEDBYTES] = (uint8_t) nonce; - buf[SEEDBYTES + 1] = (uint8_t) (nonce >> 8); - - shake128_absorb(state, buf, sizeof(buf)); -} - -void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char seed[CRHBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[CRHBYTES + 2]; - - for (i = 0; i < CRHBYTES; ++i) { - buf[i] = seed[i]; - } - buf[CRHBYTES] = (uint8_t) nonce; - buf[CRHBYTES + 1] = (uint8_t) (nonce >> 8); - - 
shake256_absorb(state, buf, sizeof(buf)); -} diff --git a/crypto_sign/dilithium3/clean/symmetric.h b/crypto_sign/dilithium3/clean/symmetric.h index 53e9760f..d90623d6 100644 --- a/crypto_sign/dilithium3/clean/symmetric.h +++ b/crypto_sign/dilithium3/clean/symmetric.h @@ -1,8 +1,11 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM3_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3_CLEAN_SYMMETRIC_H + +#include "params.h" +#include "stream.h" + #include "fips202.h" -#include "params.h" #define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) #define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init(STATE, SEED, NONCE) @@ -13,11 +16,8 @@ #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char *seed, - uint16_t nonce); +typedef shake128ctx stream128_state; +typedef shake256ctx stream256_state; + #endif diff --git a/crypto_sign/dilithium4/META.yml b/crypto_sign/dilithium4/META.yml index 7dffe06a..822003b8 100644 --- a/crypto_sign/dilithium4/META.yml +++ b/crypto_sign/dilithium4/META.yml @@ -17,4 +17,13 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/dilithium/commit/40f79645879b5c69835cd91d06945d7c24f39922 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + required_flags: + - avx2 + - bmi2 diff --git a/crypto_sign/dilithium4/avx2/LICENSE b/crypto_sign/dilithium4/avx2/LICENSE new file mode 100644 index 00000000..40541676 --- /dev/null +++ 
b/crypto_sign/dilithium4/avx2/LICENSE @@ -0,0 +1,6 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and the random number generator +we are using public-domain code from sources +and by authors listed in comments on top of +the respective files. diff --git a/crypto_sign/dilithium4/avx2/Makefile b/crypto_sign/dilithium4/avx2/Makefile new file mode 100644 index 00000000..c0023b57 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/Makefile @@ -0,0 +1,43 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium4_avx2.a + +SOURCES = fips202x4.c invntt.s nttconsts.c ntt.s packing.c pointwise.S poly.c \ + polyvec.c reduce.s rejsample.c rounding.c sign.c stream.c +OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ + polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o +HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ + nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ + fips202x4.h shuffle.inc + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -Wcast-align \ + -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +KECCAK4XDIR=../../../common/keccak4x +KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o +KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.S $(HEADERS) + $(AS) -c -o $@ $< + +$(LIB): $(OBJECTS) $(KECCAK4X) + $(AR) -r $@ $^ + +$(KECCAK4X): + $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) + $(MAKE) -C $(KECCAK4XDIR) clean + diff --git a/crypto_sign/dilithium4/avx2/alignment.h b/crypto_sign/dilithium4/avx2/alignment.h new file mode 100644 index 00000000..fd6bbf6e --- /dev/null +++ b/crypto_sign/dilithium4/avx2/alignment.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_ALIGNMENT_H +#define PQCLEAN_DILITHIUM4_AVX2_ALIGNMENT_H + 
/*
 * Anonymous union types that overlay a plain integer array with an array of
 * __m256i, so the integer array inherits the alignment of an AVX2 vector.
 *
 *   as_arr - element-wise view of the storage
 *   as_vec - the same storage viewed as 256-bit vectors
 *
 * N is the number of integer elements. One __m256i holds 32 uint8_t,
 * 8 uint32_t or 4 uint64_t values, which is the divisor used for as_vec;
 * N should be a multiple of that divisor, otherwise as_vec only spans the
 * whole vectors that fit.
 *
 * The including file must make uint8_t/uint32_t/uint64_t and __m256i
 * visible before using these macros.
 */
#define ALIGNED_UINT8(N)        \
    union {                     \
        uint8_t as_arr[N];      \
        __m256i as_vec[(N)/32]; \
    }

#define ALIGNED_UINT32(N)       \
    union {                     \
        uint32_t as_arr[N];     \
        __m256i as_vec[(N)/8];  \
    }

#define ALIGNED_UINT64(N)       \
    union {                     \
        uint64_t as_arr[N];     \
        __m256i as_vec[(N)/4];  \
    }
/*************************************************
* Name:        keccak_absorb4x
*
* Description: Absorb step of four Keccak sponges run in parallel.
*              The state is cleared, the four equally long messages are
*              absorbed in blocks of r bytes (permuting after every full
*              block), and the padded final block is XORed in. No
*              permutation is applied after the final block.
*
*              Lane i of instance j lives in 64-bit word 4 * i + j of the
*              state, i.e. each __m256i holds the same lane of all four
*              instances.
*
* Arguments:   - __m256i *s: output state, 25 vectors; fully overwritten
*              - unsigned int r: rate in bytes; expected to be a multiple
*                                of 8 and at most 200 (size of the local
*                                padding buffers)
*              - const uint8_t *m0..m3: the four input messages
*              - unsigned long long mlen: length in bytes of each message
*              - uint8_t p: domain separation byte placed right after the
*                           message
**************************************************/
static void keccak_absorb4x(__m256i *s,
                            unsigned int r,
                            const uint8_t *m0,
                            const uint8_t *m1,
                            const uint8_t *m2,
                            const uint8_t *m3,
                            unsigned long long mlen,
                            uint8_t p) {
    unsigned long long i;
    uint8_t t0[200];
    uint8_t t1[200];
    uint8_t t2[200];
    uint8_t t3[200];
    uint64_t *ss = (uint64_t *)s;

    /* Callers pass an uninitialised state, so write zeros instead of
     * XORing the (indeterminate) old contents with themselves. */
    for (i = 0; i < 25; ++i) {
        s[i] = _mm256_setzero_si256();
    }

    /* Full blocks */
    while (mlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            ss[4 * i + 0] ^= load64(m0 + 8 * i);
            ss[4 * i + 1] ^= load64(m1 + 8 * i);
            ss[4 * i + 2] ^= load64(m2 + 8 * i);
            ss[4 * i + 3] ^= load64(m3 + 8 * i);
        }

        KeccakF1600_StatePermute4x(s);
        mlen -= r;
        m0 += r;
        m1 += r;
        m2 += r;
        m3 += r;
    }

    /* Final block: remaining message bytes, then p, zero fill, and the
     * top bit of the last rate byte set. */
    for (i = 0; i < r; ++i) {
        t0[i] = 0;
        t1[i] = 0;
        t2[i] = 0;
        t3[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t0[i] = m0[i];
        t1[i] = m1[i];
        t2[i] = m2[i];
        t3[i] = m3[i];
    }

    /* i == mlen < r here, so the write stays inside the block */
    t0[i] = p;
    t1[i] = p;
    t2[i] = p;
    t3[i] = p;

    t0[r - 1] |= 128;
    t1[r - 1] |= 128;
    t2[r - 1] |= 128;
    t3[r - 1] |= 128;

    for (i = 0; i < r / 8; ++i) {
        ss[4 * i + 0] ^= load64(t0 + 8 * i);
        ss[4 * i + 1] ^= load64(t1 + 8 * i);
        ss[4 * i + 2] ^= load64(t2 + 8 * i);
        ss[4 * i + 3] ^= load64(t3 + 8 * i);
    }
}
/*************************************************
* Name:        PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x
*
* Description: Four-way parallel absorb with rate SHAKE128_RATE and
*              domain separation byte 0x1F. Forwards to keccak_absorb4x.
*
* Arguments:   - __m256i *s: output state (25 vectors)
*              - const uint8_t *m0..m3: the four input messages
*              - unsigned long long mlen: length in bytes of each message
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(
    __m256i *s,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x
*
* Description: Squeeze nblocks blocks of SHAKE128_RATE bytes from each of
*              the four parallel instances. Forwards to
*              keccak_squeezeblocks4x, which permutes the state before
*              every block it writes.
*
* Arguments:   - uint8_t *h0..h3: the four output buffers, each receiving
*                                 nblocks * SHAKE128_RATE bytes
*              - unsigned long nblocks: number of blocks per instance
*              - __m256i *s: state; updated in place
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long nblocks,
    __m256i *s) {
    keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x
*
* Description: Four-way parallel absorb with rate SHAKE256_RATE and
*              domain separation byte 0x1F. Forwards to keccak_absorb4x.
*
* Arguments:   - __m256i *s: output state (25 vectors)
*              - const uint8_t *m0..m3: the four input messages
*              - unsigned long long mlen: length in bytes of each message
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x(
    __m256i *s,
    const uint8_t *m0,
    const uint8_t *m1,
    const uint8_t *m2,
    const uint8_t *m3,
    unsigned long long mlen) {
    keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F);
}

/*************************************************
* Name:        PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x
*
* Description: Squeeze nblocks blocks of SHAKE256_RATE bytes from each of
*              the four parallel instances. Forwards to
*              keccak_squeezeblocks4x, which permutes the state before
*              every block it writes.
*
* Arguments:   - uint8_t *h0..h3: the four output buffers, each receiving
*                                 nblocks * SHAKE256_RATE bytes
*              - unsigned long nblocks: number of blocks per instance
*              - __m256i *s: state; updated in place
**************************************************/
void PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(
    uint8_t *h0,
    uint8_t *h1,
    uint8_t *h2,
    uint8_t *h3,
    unsigned long nblocks,
    __m256i *s) {
    keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s);
}
t[3][i]; + } + } +} + +void PQCLEAN_DILITHIUM4_AVX2_shake256_4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long long hlen, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen) { + unsigned int i; + unsigned long nblocks = hlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + __m256i s[25]; + + PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); + PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + + h0 += nblocks * SHAKE256_RATE; + h1 += nblocks * SHAKE256_RATE; + h2 += nblocks * SHAKE256_RATE; + h3 += nblocks * SHAKE256_RATE; + hlen -= nblocks * SHAKE256_RATE; + + if (hlen) { + PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); + for (i = 0; i < hlen; ++i) { + h0[i] = t[0][i]; + h1[i] = t[1][i]; + h2[i] = t[2][i]; + h3[i] = t[3][i]; + } + } +} diff --git a/crypto_sign/dilithium4/avx2/fips202x4.h b/crypto_sign/dilithium4/avx2/fips202x4.h new file mode 100644 index 00000000..475e5899 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/fips202x4.h @@ -0,0 +1,65 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_FIPS202X4_H +#define PQCLEAN_DILITHIUM4_AVX2_FIPS202X4_H + +#include +#include + +#include "params.h" + +void PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x( + __m256i *s, + const uint8_t *m0, + const uint8_t *m1, + const uint8_t *m2, + const uint8_t *m3, + unsigned long long mlen); + +void PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x( + uint8_t *h0, + uint8_t *h1, + uint8_t *h2, + uint8_t *h3, + unsigned long nblocks, + __m256i *s); + +void PQCLEAN_DILITHIUM4_AVX2_shake128_4x( + uint8_t *h0, 
# Inverse-NTT butterfly on four register pairs (l0,h0) .. (l3,h3).
#
# For every pair:
#     t = (l + ymm2 - h) * zeta      (32x32 -> 64-bit product per qword lane)
#     l = l + h
#     h = (t + ((t * ymm0) mod 2^32) * ymm1) >> 32
#
# vpmuludq only multiplies the low dword of each 64-bit lane, so the values
# being processed sit in the low dwords of the four qword lanes.
#
# Register conventions, as set up by the routines in this file:
#     ymm0 = 8xqinv, ymm1 = 8xq, ymm2 = 8x256q
#     z0   = register holding the multiplier for pairs 0 and 1 (default ymm15)
#     z1   = register holding the multiplier for pairs 2 and 3 (default ymm3)
# Clobbers ymm12-ymm15.
#
# The instructions of the four pairs are interleaved; the order below is the
# original hand scheduling and is kept as is.
.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3 z0=15,z1=3
# ymm12..14 = l + ymm2 for pairs 0..2
vpaddd %ymm2,%ymm\l0,%ymm12
vpaddd %ymm2,%ymm\l1,%ymm13
vpaddd %ymm2,%ymm\l2,%ymm14

# ymm12..14 = l + ymm2 - h
vpsubd %ymm\h0,%ymm12,%ymm12
vpsubd %ymm\h1,%ymm13,%ymm13
vpsubd %ymm\h2,%ymm14,%ymm14

# multiply pairs 0 and 1 by z0; start pair 3
vpmuludq %ymm\z0,%ymm12,%ymm12
vpmuludq %ymm\z0,%ymm13,%ymm13
vpaddd %ymm2,%ymm\l3,%ymm15

# multiply pair 2 by z1; finish the difference of pair 3; l0 = l0 + h0
vpmuludq %ymm\z1,%ymm14,%ymm14
vpsubd %ymm\h3,%ymm15,%ymm15
vpaddd %ymm\l0,%ymm\h0,%ymm\l0

# multiply pair 3 by z1; l1 = l1 + h1, l2 = l2 + h2
vpmuludq %ymm\z1,%ymm15,%ymm15
vpaddd %ymm\l1,%ymm\h1,%ymm\l1
vpaddd %ymm\l2,%ymm\h2,%ymm\l2

# l3 = l3 + h3
vpaddd %ymm\l3,%ymm\h3,%ymm\l3

# reduce the four 64-bit products: h = (t + (low32(t) * ymm0 mod 2^32) * ymm1) >> 32
vpmuludq %ymm0,%ymm12,%ymm\h0
vpmuludq %ymm0,%ymm13,%ymm\h1
vpmuludq %ymm0,%ymm14,%ymm\h2
vpmuludq %ymm0,%ymm15,%ymm\h3
vpmuludq %ymm1,%ymm\h0,%ymm\h0
vpmuludq %ymm1,%ymm\h1,%ymm\h1
vpmuludq %ymm1,%ymm\h2,%ymm\h2
vpmuludq %ymm1,%ymm\h3,%ymm\h3
vpaddq %ymm12,%ymm\h0,%ymm\h0
vpaddq %ymm13,%ymm\h1,%ymm\h1
vpaddq %ymm14,%ymm\h2,%ymm\h2
vpaddq %ymm15,%ymm\h3,%ymm\h3
vpsrlq $32,%ymm\h0,%ymm\h0
vpsrlq $32,%ymm\h1,%ymm\h1
vpsrlq $32,%ymm\h2,%ymm\h2
vpsrlq $32,%ymm\h3,%ymm\h3
.endm
+#load +vmovdqa (%rsi),%ymm6 +vmovdqa 32(%rsi),%ymm7 +vmovdqa 64(%rsi),%ymm5 +vmovdqa 96(%rsi),%ymm10 + +#reorder +shuffle8 6,5,8,5 +shuffle8 7,10,6,10 + +shuffle4 8,6,4,6 +shuffle4 5,10,8,10 + +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq $32,%ymm10,%ymm11 + +level0: +vpmovzxdq (%rdx),%ymm3 +vpmovzxdq 16(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpmovzxdq 32(%rdx),%ymm5 +vpmovzxdq 48(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level1: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpmovzxdq 64(%rdx),%ymm15 +vpmovzxdq 80(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level2: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpmovzxdq 96(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#shuffle +shuffle4 4,5,3,5 +shuffle4 6,7,4,7 +shuffle4 8,9,6,9 +shuffle4 10,11,8,11 + +level3: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 112(%rdx),%ymm14 +vpbroadcastd 116(%rdx),%ymm15 +vpblendd $0xF0,%ymm15,%ymm14,%ymm10 + +butterfly 3,4,6,8,5,7,9,11 10,10 + +#shuffle +shuffle8 3,4,10,4 +shuffle8 6,8,3,8 +shuffle8 5,7,6,7 +shuffle8 9,11,5,11 + +level4: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 120(%rdx),%ymm9 + +butterfly 10,3,6,5,4,8,7,11 9,9 + 
+#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm4,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx +PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x256q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 256(%rsi),%ymm5 +vmovdqa 512(%rsi),%ymm6 +vmovdqa 768(%rsi),%ymm7 +vmovdqa 1024(%rsi),%ymm8 +vmovdqa 1280(%rsi),%ymm9 +vmovdqa 1536(%rsi),%ymm10 +vmovdqa 1792(%rsi),%ymm11 + +level5: +vpbroadcastd (%rdx),%ymm3 +vpbroadcastd 4(%rdx),%ymm15 +vpaddd %ymm2,%ymm4,%ymm12 +vpaddd %ymm2,%ymm6,%ymm13 +vpaddd %ymm2,%ymm8,%ymm14 + +vpsubd %ymm5,%ymm12,%ymm12 +vpsubd %ymm7,%ymm13,%ymm13 +vpsubd %ymm9,%ymm14,%ymm14 + +vpmuludq %ymm3,%ymm12,%ymm12 +vpmuludq %ymm15,%ymm13,%ymm13 +vpaddd %ymm2,%ymm10,%ymm15 + +vpsubd %ymm11,%ymm15,%ymm15 +vpaddd %ymm4,%ymm5,%ymm4 +vpaddd %ymm6,%ymm7,%ymm6 +vpbroadcastd 8(%rdx),%ymm5 +vpbroadcastd 12(%rdx),%ymm7 + +vpmuludq %ymm5,%ymm14,%ymm14 +vpmuludq %ymm7,%ymm15,%ymm15 +vpaddd %ymm8,%ymm9,%ymm8 + +vpaddd %ymm10,%ymm11,%ymm10 + +vpmuludq %ymm0,%ymm12,%ymm5 +vpmuludq %ymm0,%ymm13,%ymm7 +vpmuludq %ymm0,%ymm14,%ymm9 +vpmuludq %ymm0,%ymm15,%ymm11 +vpmuludq %ymm1,%ymm5,%ymm5 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm9,%ymm9 +vpmuludq %ymm1,%ymm11,%ymm11 +vpaddq %ymm12,%ymm5,%ymm5 +vpaddq %ymm13,%ymm7,%ymm7 +vpaddq %ymm14,%ymm9,%ymm9 +vpaddq %ymm15,%ymm11,%ymm11 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm7,%ymm7 +vpsrlq $32,%ymm9,%ymm9 +vpsrlq $32,%ymm11,%ymm11 + +level6: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 16(%rdx),%ymm15 +vpbroadcastd 20(%rdx),%ymm3 + +butterfly 4,5,8,9,6,7,10,11 + +level7: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 24(%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 3,3 + +#consts +vmovdqa 
_PQCLEAN_DILITHIUM4_AVX2_8xdiv(%rip),%ymm3 + +vpmuludq %ymm3,%ymm4,%ymm4 +vpmuludq %ymm3,%ymm5,%ymm5 +vpmuludq %ymm3,%ymm6,%ymm6 +vpmuludq %ymm3,%ymm7,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm12,%ymm4,%ymm4 +vpaddq %ymm13,%ymm5,%ymm5 +vpaddq %ymm14,%ymm6,%ymm6 +vpaddq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm5,%ymm5 +vpsrlq $32,%ymm6,%ymm6 +vpsrlq $32,%ymm7,%ymm7 + +#store +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_mask(%rip),%ymm3 +vpermd %ymm4,%ymm3,%ymm4 +vpermd %ymm5,%ymm3,%ymm5 +vpermd %ymm6,%ymm3,%ymm6 +vpermd %ymm7,%ymm3,%ymm7 +vpermd %ymm8,%ymm3,%ymm8 +vpermd %ymm9,%ymm3,%ymm9 +vpermd %ymm10,%ymm3,%ymm10 +vpermd %ymm11,%ymm3,%ymm11 +vmovdqa %xmm4,(%rdi) +vmovdqa %xmm5,128(%rdi) +vmovdqa %xmm6,256(%rdi) +vmovdqa %xmm7,384(%rdi) +vmovdqa %xmm8,512(%rdi) +vmovdqa %xmm9,640(%rdi) +vmovdqa %xmm10,768(%rdi) +vmovdqa %xmm11,896(%rdi) + +ret diff --git a/crypto_sign/dilithium4/avx2/ntt.h b/crypto_sign/dilithium4/avx2/ntt.h new file mode 100644 index 00000000..39d02bef --- /dev/null +++ b/crypto_sign/dilithium4/avx2/ntt.h @@ -0,0 +1,26 @@ +#ifndef NTT_H +#define NTT_H + +#include + +#include "nttconsts.h" +#include "params.h" + +void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas); +void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas); + +void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx(uint64_t *tmp, + const uint32_t *a, + const uint32_t *zetas_inv); +void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx(uint32_t *a, + const uint64_t *tmp, + const uint32_t *zetas_inv); + +void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx(uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx(uint32_t *c, const uint32_t 
# Forward-NTT butterfly on four register pairs (rl0,rh0) .. (rl3,rh3).
#
# For every pair:
#     p  = rh * zeta                 (32x32 -> 64-bit product per qword lane)
#     t  = (p + ((p * ymm0) mod 2^32) * ymm1) >> 32
#     rh = rl + ymm2 - t
#     rl = rl + t
#
# vpmuludq only multiplies the low dword of each 64-bit lane, so the values
# being processed sit in the low dwords of the four qword lanes.
#
# Register conventions, as set up by the routines in this file:
#     ymm0 = 8xqinv, ymm1 = 8xq, ymm2 = 8x2q
#     z0..z3 = registers holding the multiplier for pairs 0..3 (default ymm3)
# Clobbers ymm12-ymm15. The z registers are read before ymm12-ymm15 are
# overwritten, so callers may pass ymm12-ymm15 as z0..z3.
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 z0=3,z1=3,z2=3,z3=3
#mul: rh = rh * zeta (64-bit product)
vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0
vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1
vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2
vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3

#reduce: ymm12..15 = (p + (low32(p) * ymm0 mod 2^32) * ymm1) >> 32
vpmuludq %ymm0,%ymm\rh0,%ymm12
vpmuludq %ymm0,%ymm\rh1,%ymm13
vpmuludq %ymm0,%ymm\rh2,%ymm14
vpmuludq %ymm0,%ymm\rh3,%ymm15
vpmuludq %ymm1,%ymm12,%ymm12
vpmuludq %ymm1,%ymm13,%ymm13
vpmuludq %ymm1,%ymm14,%ymm14
vpmuludq %ymm1,%ymm15,%ymm15
vpaddq %ymm\rh0,%ymm12,%ymm12
vpaddq %ymm\rh1,%ymm13,%ymm13
vpaddq %ymm\rh2,%ymm14,%ymm14
vpaddq %ymm\rh3,%ymm15,%ymm15
vpsrlq $32,%ymm12,%ymm12
vpsrlq $32,%ymm13,%ymm13
vpsrlq $32,%ymm14,%ymm14
vpsrlq $32,%ymm15,%ymm15

#update: rh = rl + ymm2 - t, rl = rl + t
vpaddd %ymm2,%ymm\rl0,%ymm\rh0
vpaddd %ymm2,%ymm\rl1,%ymm\rh1
vpaddd %ymm2,%ymm\rl2,%ymm\rh2
vpaddd %ymm2,%ymm\rl3,%ymm\rh3
vpaddd %ymm12,%ymm\rl0,%ymm\rl0
vpaddd %ymm13,%ymm\rl1,%ymm\rl1
vpaddd %ymm14,%ymm\rl2,%ymm\rl2
vpaddd %ymm15,%ymm\rl3,%ymm\rl3
vpsubd %ymm12,%ymm\rh0,%ymm\rh0
vpsubd %ymm13,%ymm\rh1,%ymm\rh1
vpsubd %ymm14,%ymm\rh2,%ymm\rh2
vpsubd %ymm15,%ymm\rh3,%ymm\rh3
.endm
+#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 + +butterfly 4,5,8,9,6,7,10,11 12,12,13,13 + +level2: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 12(%rdx),%ymm12 +vpbroadcastd 16(%rdx),%ymm13 +vpbroadcastd 20(%rdx),%ymm14 +vpbroadcastd 24(%rdx),%ymm15 + +butterfly 4,6,8,10,5,7,9,11 12,13,14,15 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,256(%rdi) +vmovdqa %ymm6,512(%rdi) +vmovdqa %ymm7,768(%rdi) +vmovdqa %ymm8,1024(%rdi) +vmovdqa %ymm9,1280(%rdi) +vmovdqa %ymm10,1536(%rdi) +vmovdqa %ymm11,1792(%rdi) + +ret + +.global PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx +PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x2q(%rip),%ymm2 + +#load +vmovdqa (%rsi),%ymm4 +vmovdqa 32(%rsi),%ymm5 +vmovdqa 64(%rsi),%ymm6 +vmovdqa 96(%rsi),%ymm7 +vmovdqa 128(%rsi),%ymm8 +vmovdqa 160(%rsi),%ymm9 +vmovdqa 192(%rsi),%ymm10 +vmovdqa 224(%rsi),%ymm11 + +level3: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd (%rdx),%ymm3 + +butterfly 4,5,6,7,8,9,10,11 + +level4: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpbroadcastd 4(%rdx),%ymm12 +vpbroadcastd 8(%rdx),%ymm13 +vpblendd $0xF0,%ymm13,%ymm12,%ymm12 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly 3,8,4,9,5,10,6,11 12,12,12,12 + +level5: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpmovzxdq 12(%rdx),%ymm12 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly 7,5,3,10,8,6,4,11 12,12,12,12 + +level6: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpmovzxdq 28(%rdx),%ymm12 +vpmovzxdq 44(%rdx),%ymm13 + +butterfly 7,5,8,6,3,10,4,11 12,12,13,13 + +level7: +#PQCLEAN_DILITHIUM4_AVX2_zetas +vpmovzxdq 60(%rdx),%ymm12 +vpmovzxdq 76(%rdx),%ymm13 +vpmovzxdq 92(%rdx),%ymm14 +vpmovzxdq 108(%rdx),%ymm15 + +butterfly 7,3,8,4,5,10,6,11 12,13,14,15 + +#store +vpsllq $32,%ymm5,%ymm5 +vpsllq $32,%ymm10,%ymm10 +vpsllq $32,%ymm6,%ymm6 
+vpsllq $32,%ymm11,%ymm11 +vpblendd $0xAA,%ymm5,%ymm7,%ymm7 +vpblendd $0xAA,%ymm10,%ymm3,%ymm3 +vpblendd $0xAA,%ymm6,%ymm8,%ymm8 +vpblendd $0xAA,%ymm11,%ymm4,%ymm4 + +shuffle4 7,3,5,3 +shuffle4 8,4,7,4 + +shuffle8 5,7,6,7 +shuffle8 3,4,5,4 + +vmovdqa %ymm6,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm4,96(%rdi) + +ret diff --git a/crypto_sign/dilithium4/avx2/nttconsts.c b/crypto_sign/dilithium4/avx2/nttconsts.c new file mode 100644 index 00000000..7bf71ea7 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/nttconsts.c @@ -0,0 +1,80 @@ +#include "nttconsts.h" + +#define QINV 4236238847 // -q^(-1) mod 2^32 +#define MONT 4193792ULL +#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) + + +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, + 256 * Q + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, + 0x7FFFFF, 0x7FFFFF + } + }; +const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; + +#undef QINV +#undef MONT +#undef DIV + + +const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas = { + .as_arr = { + 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, + 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, + 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 
5933984, 7144689, 7183191, + 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, + 7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, + 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, + 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, + 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, + 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, + 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, + 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, + 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, + 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, + 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, + 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, + 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, + 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, + 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, + 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, + 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, + 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, + 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, 
+ 4834730, 7018208, 1976782 + } +}; + +const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv = { + .as_arr = { + 6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, + 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, + 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, + 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, + 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, + 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, + 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, + 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, + 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, + 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, + 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, + 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, + 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, + 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, + 5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, + 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, + 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, + 1000202, 6441103, 3632928, 3190144, 4083598, 
1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, + 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326, + 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, + 3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, + 518909, 2608894, 3975713 + } +}; diff --git a/crypto_sign/dilithium4/avx2/nttconsts.h b/crypto_sign/dilithium4/avx2/nttconsts.h new file mode 100644 index 00000000..1904a67b --- /dev/null +++ b/crypto_sign/dilithium4/avx2/nttconsts.h @@ -0,0 +1,27 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H +#define PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H + +#include +#include + +#include "alignment.h" +#include "params.h" + +typedef ALIGNED_UINT32(8) aligned_uint32x8_t; + +typedef ALIGNED_UINT32(N) aligned_uint32xN_t; + + +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xqinv; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xq; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x2q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x256q; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_mask; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8x23ones; +extern const aligned_uint32x8_t _PQCLEAN_DILITHIUM4_AVX2_8xdiv; + +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas; +extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv; + +#endif //PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H + diff --git a/crypto_sign/dilithium4/avx2/packing.c b/crypto_sign/dilithium4/avx2/packing.c new file mode 100644 index 00000000..137ba25c --- /dev/null +++ b/crypto_sign/dilithium4/avx2/packing.c @@ -0,0 +1,305 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const polyveck *t1: pointer to output vector t1 +* - uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +* - const polyveck *t0: pointer to vector t0 +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
+* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t key[]: output byte array for key +* - const uint8_t tr[]: output byte array for tr +* - const polyvecl *s1: pointer to output vector s1 +* - const polyveck *s2: pointer to output vector s2 +* - const polyveck *r0: pointer to output vector t0 +* - uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += L * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + } + sk += K * POLETA_SIZE_PACKED; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (z, h, c). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +* - const poly *c: pointer to PQCLEAN_DILITHIUM4_AVX2_challenge polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { + unsigned int i, j, k; + uint64_t signs, mask; + + for (i = 0; i < L; ++i) { /* z: L polynomials, POLZ_SIZE_PACKED bytes each */ + PQCLEAN_DILITHIUM4_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + } + sig += L * POLZ_SIZE_PACKED; + + /* Encode h: the first OMEGA bytes list the indices j of the nonzero hint coefficients, polynomial by polynomial; byte OMEGA+i stores the running count k after polynomial i. k is not bounds-checked here. NOTE(review): callers presumably guarantee at most OMEGA nonzero hints - confirm. */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t)j; + } + } + + sig[OMEGA + i] = (uint8_t)k; + } + while (k < OMEGA) { /* unused index slots are written as zero */ + sig[k++] = 0; + } + sig += OMEGA + K; + + /* Encode c: N/8 bitmap bytes mark the nonzero coefficients; each nonzero coefficient consumes one bit of signs, set when the coefficient equals Q - 1; signs is then written as 8 bytes, least significant byte first. */ + signs = 0; + mask = 1; + for (i = 0; i < N / 8; ++i) { + sig[i] = 0; + for (j = 0; j < 8; ++j) { + if (c->coeffs[8 * i + j] != 0) { + sig[i] |= (uint8_t)(1u << j); + if (c->coeffs[8 * i + j] == (Q - 1)) { + signs |= mask; + } + mask <<= 1; /* advances only for nonzero coefficients */ + } + } + } + sig += N / 8; + for (i = 0; i < 8; ++i) { + sig[i] = (uint8_t)(signs >> 8u * i); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_sig +* +* Description: Unpack signature sig = (z, h, c). +* +* Arguments: - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - poly *c: pointer to output PQCLEAN_DILITHIUM4_AVX2_challenge polynomial +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { + unsigned int i, j, k; + uint64_t signs; + + for (i = 0; i < L; ++i) { /* z: L packed polynomials */ + PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + } + sig += L * POLZ_SIZE_PACKED; + + /* Decode h: byte OMEGA+i is the cumulative number of hint indices after polynomial i; the indices themselves occupy the first OMEGA bytes. */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { /* the cumulative count must not decrease and must not exceed OMEGA */ + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability: within one polynomial the indices must be strictly increasing */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + sig += OMEGA + K; + + /* Decode c: N/8 bitmap bytes followed by 8 bytes of sign bits */ + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + signs = 0; + for (i = 0; i < 8; ++i) { /* reassemble the 64-bit sign word, least significant byte first */ + signs |= (uint64_t)sig[N / 8 + i] << 8 * i; + } + + /* Extra sign bits are zero for strong unforgeability: any of the top 4 bits set is rejected */ + if (signs >> 60) { + return 1; + } + + for (i = 0; i < N / 8; ++i) { + for (j = 0; j < 8; ++j) { + if ((sig[i] >> j) & 0x01) { + c->coeffs[8 * i + j] = 1; + c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); /* the mask is all-ones when the sign bit is set, turning 1 into Q - 1; otherwise the coefficient stays 1 */ + signs >>= 1; /* one sign bit is consumed per nonzero coefficient */ + } + } + } + + return 0; +} diff --git a/crypto_sign/dilithium4/avx2/packing.h b/crypto_sign/dilithium4/avx2/packing.h new file mode 100644 index 00000000..bcb234bd --- /dev/null +++ b/crypto_sign/dilithium4/avx2/packing.h @@ -0,0 +1,36 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM4_AVX2_PACKING_H + +#include "params.h" +#include "polyvec.h" + +void PQCLEAN_DILITHIUM4_AVX2_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM4_AVX2_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2,
+ const polyveck *t0); +void PQCLEAN_DILITHIUM4_AVX2_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); + +void PQCLEAN_DILITHIUM4_AVX2_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM4_AVX2_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM4_AVX2_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); /* returns nonzero for a malformed signature */ + +#endif diff --git a/crypto_sign/dilithium4/avx2/params.h b/crypto_sign/dilithium4/avx2/params.h new file mode 100644 index 00000000..831285ee --- /dev/null +++ b/crypto_sign/dilithium4/avx2/params.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM4_AVX2_PARAMS_H + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 /* coefficients per polynomial */ +#define Q 8380417 +#define QBITS 23 +#define D 14 +#define GAMMA1 ((Q - 1)/16) +#define GAMMA2 (GAMMA1/2) +#define ALPHA (2*GAMMA2) + +#define K 6 /* polynomials per polyveck */ +#define L 5 /* polynomials per polyvecl */ +#define ETA 3 +#define SETABITS 3 /* bits per packed eta coefficient, see POLETA_SIZE_PACKED */ +#define BETA 175 +#define OMEGA 120 /* hint-index bytes in a packed signature */ + + +#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) +#define POLT0_SIZE_PACKED ((N*D)/8) +#define POLETA_SIZE_PACKED ((N*SETABITS)/8) +#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) +#define POLW1_SIZE_PACKED ((N*4)/8) + +#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLT1_SIZE_PACKED) +#define CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + (L + K)*POLETA_SIZE_PACKED + CRHBYTES + K*POLT0_SIZE_PACKED) /* rho and key, s1 and s2, tr, t0 */ +#define CRYPTO_BYTES (L*POLZ_SIZE_PACKED + (OMEGA + K) + (N/8 + 8)) /* packed z, hint bytes, challenge bitmap plus sign bytes */ + +#endif diff --git a/crypto_sign/dilithium4/avx2/pointwise.S b/crypto_sign/dilithium4/avx2/pointwise.S new file mode 100644 index 00000000..672820c1 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/pointwise.S @@ -0,0 +1,193 @@ +#include "params.h" + +.global PQCLEAN_DILITHIUM4_AVX2_pointwise_avx +PQCLEAN_DILITHIUM4_AVX2_pointwise_avx: /* the routine stores to (%rdi) and loads its two inputs from (%rsi) and (%rdx) */ +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 /* ymm0: first reduction constant, kept for the whole routine */ +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 /* ymm1: second reduction constant */ +
+xor %eax,%eax /* eax counts loop iterations */ +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vpsrlq $32,%ymm14,%ymm15 /* odd-numbered registers now hold the upper 32-bit halves of each 64-bit lane */ + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 +vpmuludq %ymm6,%ymm14,%ymm6 +vpmuludq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm0,%ymm6,%ymm14 +vpmuludq %ymm0,%ymm7,%ymm15 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpmuludq %ymm1,%ymm14,%ymm14 +vpmuludq %ymm1,%ymm15,%ymm15 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3 +vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpaddq %ymm6,%ymm14,%ymm6 +vpaddq %ymm7,%ymm15,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vpsrlq $32,%ymm6,%ymm6 /* only the even-lane sums are shifted down; the odd-lane sums already have their result in the upper half */ + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 /* 10 iterations of 96 bytes each; the code after the loop handles 64 more bytes */ + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm2,%ymm10,%ymm2 +vpmuludq %ymm3,%ymm11,%ymm3 +vpmuludq %ymm4,%ymm12,%ymm4 +vpmuludq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuludq %ymm0,%ymm2,%ymm10 +vpmuludq %ymm0,%ymm3,%ymm11 +vpmuludq %ymm0,%ymm4,%ymm12 +vpmuludq %ymm0,%ymm5,%ymm13 +vpmuludq %ymm1,%ymm10,%ymm10 +vpmuludq %ymm1,%ymm11,%ymm11 +vpmuludq %ymm1,%ymm12,%ymm12 +vpmuludq %ymm1,%ymm13,%ymm13 +vpaddq %ymm2,%ymm10,%ymm2 +vpaddq %ymm3,%ymm11,%ymm3
+vpaddq %ymm4,%ymm12,%ymm4 +vpaddq %ymm5,%ymm13,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) /* final 64 bytes of output; the pointers are not advanced again */ + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 + +#mul +vpmuludq %ymm6,%ymm10,%ymm6 +vpmuludq %ymm7,%ymm11,%ymm7 +vpmuludq %ymm8,%ymm12,%ymm8 +vpmuludq %ymm9,%ymm13,%ymm9 /* unreduced 64-bit products of 64 bytes at byte offset off are left in ymm6..ymm9 */ +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 /* adds the latest products into the accumulators ymm2..ymm5 */ +.endm + +.global PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx +PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx: /* stores to (%rdi); loads from (%rsi) and (%rdx) at five offsets 1024 bytes apart */ +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xqinv(%rip),%ymm0 +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 /* the first product initialises the accumulators */ + +pointwise 1024 +acc + +pointwise 2048 +acc + +pointwise 3072 +acc + +pointwise 4096 +acc /* five products summed before a single reduction; NOTE(review): 1024 bytes presumably is one polynomial of N 32-bit coefficients and five matches L - confirm */ + +#reduce +vpmuludq %ymm0,%ymm2,%ymm6 +vpmuludq %ymm0,%ymm3,%ymm7 +vpmuludq %ymm0,%ymm4,%ymm8 +vpmuludq %ymm0,%ymm5,%ymm9 +vpmuludq %ymm1,%ymm6,%ymm6 +vpmuludq %ymm1,%ymm7,%ymm7 +vpmuludq %ymm1,%ymm8,%ymm8 +vpmuludq %ymm1,%ymm9,%ymm9 +vpaddq %ymm2,%ymm6,%ymm2 +vpaddq %ymm3,%ymm7,%ymm3 +vpaddq %ymm4,%ymm8,%ymm4 +vpaddq %ymm5,%ymm9,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 /* 16 iterations of 64 bytes each */ + +ret diff --git a/crypto_sign/dilithium4/avx2/poly.c b/crypto_sign/dilithium4/avx2/poly.c new file mode 100644 index 00000000..89f8e678 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/poly.c @@ -0,0 +1,936 @@ +#include +#include +
+#include "fips202x4.h" +#include "ntt.h" +#include "nttconsts.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_reduce +* +* Description: Reduce all coefficients of input polynomial to representative +* in the half-open interval [0,2*Q). Works in place via PQCLEAN_DILITHIUM4_AVX2_reduce_avx. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_reduce(poly *a) { + PQCLEAN_DILITHIUM4_AVX2_reduce_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_csubq +* +* Description: For all coefficients of input polynomial subtract Q if +* coefficient is bigger than Q. Works in place via PQCLEAN_DILITHIUM4_AVX2_csubq_avx. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_csubq(poly *a) { + PQCLEAN_DILITHIUM4_AVX2_csubq_avx(a->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_freeze +* +* Description: Reduce all coefficients of the polynomial to standard +* representatives. Runs the same two in-place steps as poly_reduce followed by poly_csubq. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_freeze(poly *a) { + PQCLEAN_DILITHIUM4_AVX2_reduce_avx(a->coeffs); + PQCLEAN_DILITHIUM4_AVX2_csubq_avx(a->coeffs); + +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed.
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + for (i = 0; i < N / 8; i++) { /* N / 8 steps, eight 32-bit coefficients per step */ + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + vec0 = _mm256_add_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_sub +* +* Description: Subtract polynomials. Assumes coefficients of second input +* polynomial to be less than 2*Q. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i vec0, vec1; + const __m256i twoq = _mm256_load_si256(_PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec); /* NOTE(review): presumably eight lanes of 2*Q, matching the 2*Q assumption documented above - confirm in nttconsts */ + + for (i = 0; i < N / 8; i++) { + vec0 = _mm256_load_si256(&a->coeffs_x8[i]); + vec1 = _mm256_load_si256(&b->coeffs_x8[i]); + vec0 = _mm256_add_epi32(vec0, twoq); /* the constant is added before subtracting b */ + vec0 = _mm256_sub_epi32(vec0, vec1); + _mm256_store_si256(&c->coeffs_x8[i], vec0); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{32-D}.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i vec; + + for (i = 0; i < N / 8; i++) { + vec = _mm256_load_si256(&a->coeffs_x8[i]); + vec = _mm256_slli_epi32(vec, D); /* left shift of every 32-bit coefficient by D bits */ + _mm256_store_si256(&a->coeffs_x8[i], vec); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_ntt +* +* Description: Forward NTT. Output coefficients can be up to 16*Q larger than +* input coefficients. Runs two passes through a temporary buffer: +* the levels0t2 helper reads a and writes tmp, the levels3t8 helper +* reads tmp and writes the result back into a. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_ntt(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + for (i = 0; i < N / 32; ++i) { /* first pass: each call is offset by 4 elements in both buffers and uses the same zeta pointer */ + PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM4_AVX2_zetas.as_arr + 1); + } + for (i = 0; i < N / 32; ++i) { /* second pass: each call is offset by 32 elements and the zeta pointer advances by 31 per call */ + PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM4_AVX2_zetas.as_arr + 8 + 31 * i); + } +} + +/************************************************* +* Name: poly_invntt_montgomery +* +* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients +* need to be less than 2*Q. Output coefficients are less than 2*Q.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(poly *a) { + unsigned int i; + ALIGNED_UINT64(N) tmp; + + for (i = 0; i < N / 32; i++) { /* first pass: a into tmp, 32 elements per call, zeta pointer advancing by 31 per call */ + PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM4_AVX2_zetas_inv.as_arr + 31 * i); + } + for (i = 0; i < N / 32; i++) { /* second pass: tmp back into a, offset 4 elements per call, fixed zeta pointer */ + PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM4_AVX2_zetas_inv.as_arr + 248); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* with 2^{-32}. Output coefficients are less than 2*Q if input +* coefficients are less than 22*Q. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { + PQCLEAN_DILITHIUM4_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives.
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients Q + a0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_power2round(poly *restrict a1, + poly *restrict a0, + const poly *restrict a) { + for (size_t i = 0; i < N; ++i) { /* coefficient by coefficient: the helper returns the value stored in a1 and writes the a0 value through its pointer argument */ + a1->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such that c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients Q + a0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_decompose( + poly *restrict a1, + poly *restrict a0, + const poly *restrict a) { + unsigned int i; + for (i = 0; i < N; ++i) { /* same calling pattern as poly_power2round, with the decompose helper */ + a1->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM4_AVX2_poly_make_hint( + poly *restrict h, + const poly *restrict a0, + const poly *restrict a1) { + unsigned int i, s = 0; + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; /* running sum of the hint coefficients; this is the documented count of 1 bits */ + } + return s; +} + +/************************************************* + * Name: PQCLEAN_DILITHIUM4_AVX2_poly_use_hint + * + * Description: Use hint polynomial to correct the high bits of a polynomial. +* Applies PQCLEAN_DILITHIUM4_AVX2_use_hint to each coefficient pair (b, h). +* Arguments: - poly *a: pointer to output polynomial with corrected high bits +* - const poly *b: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_poly_use_hint( + poly *restrict a, + const poly *restrict b, + const poly *restrict h) { + unsigned int i; + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_use_hint(b->coeffs[i], h->coeffs[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients to be standard representatives. +* +* Arguments: - const poly *a: pointer to polynomial +* - uint32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(const poly *a, uint32_t B) { + unsigned int i; + int32_t t; + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative.
*/ + for (i = 0; i < N; ++i) { + /* Absolute value of centralized representative, computed without a branch on the sign */ + t = (Q - 1) / 2 - a->coeffs[i]; + t ^= (t >> 31); /* when the shift is arithmetic, t >> 31 is 0 or all-ones, so this flips every bit of a negative t and leaves a non-negative t unchanged */ + t = (Q - 1) / 2 - t; + + if ((uint32_t)t >= B) { /* early exit on the first coefficient that reaches the bound */ + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: rej_uniform_ref +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling using array of random bytes. +* Each candidate is built from 3 consecutive bytes. +* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { /* stops when fewer than 3 unread bytes remain */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits of the 3-byte value, first byte least significant */ + + if (t < Q) { /* candidates of Q or more are rejected */ + a[ctr++] = t; + } + } + + return ctr; +} + +/************************************************* +* Name: poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling using the +* output stream from SHAKE256(seed|nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform(poly *a, + const unsigned char seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int nblocks = POLY_UNIFORM_NBLOCKS; + unsigned int buflen = POLY_UNIFORM_BUFLEN; + unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, nblocks, &state); + + ctr = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a->coeffs, N, buf, buflen); /* first attempt on the whole initial buffer */ + + while (ctr < N) { /* refill one block at a time until all N coefficients are set */ + off = buflen % 3; /* bytes left over after the last complete 3-byte group */ + for (i = 0; i < off; ++i) { /* move the leftover bytes to the front so the next block continues them */ + buf[i] = buf[buflen - off + i]; + } + + buflen = STREAM128_BLOCKBYTES + off; + stream128_squeezeblocks(buf + off, 1, &state); + ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen); + } +} + /* Four-way variant: fills a0..a3 from four streams that absorb the same seed followed by nonce0..nonce3 respectively. */ +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][SEEDBYTES + 2]; + unsigned char outbuf[4][5 * SHAKE128_RATE]; + __m256i state[25]; + + for (i = 0; i < SEEDBYTES; ++i) { + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][SEEDBYTES + 0] = nonce0; /* low byte of the nonce; the assignment to unsigned char truncates */ + inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; /* high byte of the nonce */ + inbuf[1][SEEDBYTES + 0] = nonce1; + inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; + inbuf[2][SEEDBYTES + 0] = nonce2; + inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; + inbuf[3][SEEDBYTES + 0] = nonce3; + inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(state, inbuf[0],
inbuf[1], inbuf[2], inbuf[3], + SEEDBYTES + 2); + PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, + state); /* 5 blocks per lane, filling each outbuf row completely */ + + ctr0 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE); + ctr1 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE); + ctr2 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE); + ctr3 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* all four lanes squeeze together while any lane is short */ + PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], + SHAKE128_RATE); /* a lane that is already full requests N - ctr = 0 coefficients and adds nothing */ + ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], + SHAKE128_RATE); + ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], + SHAKE128_RATE); + ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], + SHAKE128_RATE); + } +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling using array of random bytes. +* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given.
+**************************************************/ +static unsigned int rej_eta_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x07; /* first candidate: low 3 bits of the byte */ + t1 = buf[pos++] >> 5; /* second candidate: high 3 bits of the same byte */ + + if (t0 <= 2 * ETA) { /* accepted candidates are stored as Q + ETA - t */ + a[ctr++] = Q + ETA - t0; + } + if (t1 <= 2 * ETA && ctr < len) { /* ctr is re-checked because the first store may have filled the output */ + a[ctr++] = Q + ETA - t1; + } + } + + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output of the stream128 functions initialised with (seed, nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta(poly *a, + const unsigned char seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned char buf[POLY_UNIFORM_ETA_BUFLEN]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); + + while (ctr < N) { /* every byte is consumed whole, so no leftover bytes are carried between blocks */ + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } +} + /* Four-way variant: fills a0..a3 from four streams that absorb the same seed followed by nonce0..nonce3 respectively. */ +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][SEEDBYTES + 2]; +
unsigned char outbuf[4][2 * SHAKE128_RATE]; + __m256i state[25]; + + for (i = 0; i < SEEDBYTES; ++i) { /* every lane absorbs the same seed */ + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][SEEDBYTES + 0] = nonce0; /* low byte of the nonce, then its high byte */ + inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; + inbuf[1][SEEDBYTES + 0] = nonce1; + inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; + inbuf[2][SEEDBYTES + 0] = nonce2; + inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; + inbuf[3][SEEDBYTES + 0] = nonce3; + inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], + SEEDBYTES + 2); + PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2, + state); /* 2 blocks per lane, filling each outbuf row completely */ + + ctr0 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE); + ctr1 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE); + ctr2 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE); + ctr3 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* all four lanes squeeze together while any lane is short */ + PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE); /* a full lane requests 0 coefficients and adds nothing */ + ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE); + ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE); + ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE); + } +} + +/************************************************* +* Name: rej_gamma1m1_ref +* +* Description: Sample uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling +* using array of random bytes.
+* +* Arguments: - uint32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const unsigned char *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_gamma1m1_ref(uint32_t *a, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + + ctr = pos = 0; + while (ctr < len && pos + 5 <= buflen) { /* each group of 5 bytes yields two 20-bit candidates */ + t0 = buf[pos]; + t0 |= (uint32_t)buf[pos + 1] << 8; + t0 |= (uint32_t)buf[pos + 2] << 16; + t0 &= 0xFFFFF; /* t0: bytes 0 and 1 plus the low nibble of byte 2 */ + + t1 = buf[pos + 2] >> 4; /* t1: high nibble of byte 2 plus bytes 3 and 4 */ + t1 |= (uint32_t)buf[pos + 3] << 4; + t1 |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + + if (t0 <= 2 * GAMMA1 - 2) { /* accepted candidates are stored as Q + GAMMA1 - 1 - t */ + a[ctr++] = Q + GAMMA1 - 1 - t0; + } + if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { /* ctr is re-checked because the first store may have filled the output */ + a[ctr++] = Q + GAMMA1 - 1 - t1; + } + } + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection +* sampling on output stream of SHAKE256(seed|nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const unsigned char seed[]: byte array with seed of length +* CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) +#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1(poly *a, + const unsigned char seed[CRHBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; + unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); + + ctr = PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN); /* first attempt on the whole initial buffer */ + + while (ctr < N) { /* refill one block at a time until all N coefficients are set */ + off = buflen % 5; /* bytes left over after the last complete 5-byte group */ + for (i = 0; i < off; ++i) { /* move the leftover bytes to the front so the next block continues them */ + buf[i] = buf[buflen - off + i]; + } + + buflen = STREAM256_BLOCKBYTES + off; + stream256_squeezeblocks(buf + off, 1, &state); + ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen); + } +} + /* Four-way variant: fills a0..a3 from four streams that absorb the same seed followed by nonce0..nonce3 respectively. */ +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const unsigned char seed[CRHBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + unsigned int i, ctr0, ctr1, ctr2, ctr3; + unsigned char inbuf[4][CRHBYTES + 2]; + unsigned char outbuf[4][5 * SHAKE256_RATE]; + __m256i state[25]; + + for (i = 0; i < CRHBYTES; ++i) { /* every lane absorbs the same seed */ + inbuf[0][i] = seed[i]; + inbuf[1][i] = seed[i]; + inbuf[2][i] = seed[i]; + inbuf[3][i] = seed[i]; + } + inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; /* low byte of the nonce, then its high byte */ + inbuf[0][CRHBYTES + 1] = nonce0 >> 8; + inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF; + inbuf[1][CRHBYTES + 1] = nonce1 >> 8; + inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF; + inbuf[2][CRHBYTES + 1] = nonce2 >> 8; + inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF; + inbuf[3][CRHBYTES + 1] = nonce3
>> 8; + + PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], + CRHBYTES + 2); + PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, + state); /* 5 blocks per lane, filling each outbuf row completely */ + + ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE); /* the _ref sampler is used for the initial buffer too, unlike the uniform and eta 4x samplers in this file */ + ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE); + ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE); + ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* NOTE(review): any bytes left at the end of a block are not carried into the next one here, unlike PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1 - confirm this is intended */ + PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, + state); + + ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], + SHAKE256_RATE); /* a full lane requests 0 coefficients and adds nothing */ + ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], + SHAKE256_RATE); + ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], + SHAKE256_RATE); + ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], + SHAKE256_RATE); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. +* Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
+*
+* Arguments:   - unsigned char *r: pointer to output byte array with at least
+*                                  POLETA_SIZE_PACKED bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int blk, j;
+    unsigned char t[8];
+
+    /* Eight coefficients at 3 bits each are packed into three output bytes. */
+    for (blk = 0; blk < N / 8; ++blk) {
+        const uint32_t *c = &a->coeffs[8 * blk];
+        unsigned char *out = &r[3 * blk];
+
+        /* Map every coefficient to Q + ETA - coefficient before packing. */
+        for (j = 0; j < 8; ++j) {
+            t[j] = Q + ETA - c[j];
+        }
+
+        out[0]  = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6);
+        out[1]  = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+        out[2]  = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack
+*
+* Description: Unpack polynomial with coefficients in [-ETA,ETA].
+*              Output coefficients lie in [Q-ETA,Q+ETA].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int blk, j;
+    uint32_t t[8];
+
+    /* Three input bytes carry eight 3-bit fields. */
+    for (blk = 0; blk < N / 8; ++blk) {
+        const unsigned char *in = &a[3 * blk];
+        uint32_t *c = &r->coeffs[8 * blk];
+
+        t[0] = in[0] & 0x07;
+        t[1] = (in[0] >> 3) & 0x07;
+        t[2] = (uint32_t)((in[0] >> 6) | (in[1] << 2)) & 0x07;
+        t[3] = (in[1] >> 1) & 0x07;
+        t[4] = (in[1] >> 4) & 0x07;
+        t[5] = (uint32_t)((in[1] >> 7) | (in[2] << 1)) & 0x07;
+        t[6] = (in[2] >> 2) & 0x07;
+        t[7] = (in[2] >> 5) & 0x07;
+
+        /* Undo the Q + ETA - x mapping applied when packing. */
+        for (j = 0; j < 8; ++j) {
+            c[j] = Q + ETA - t[j];
+        }
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyt1_pack
+*
+* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits.
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - unsigned char *r: pointer to output byte array with at least
+*                                  POLT1_SIZE_PACKED bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int blk;
+
+    /* Eight 9-bit coefficients are laid out back to back in nine bytes. */
+    for (blk = 0; blk < N / 8; ++blk) {
+        const uint32_t *c = &a->coeffs[8 * blk];
+        unsigned char *out = &r[9 * blk];
+
+        out[0] = (uint8_t)((c[0] >> 0));
+        out[1] = (uint8_t)((c[0] >> 8) | (c[1] << 1));
+        out[2] = (uint8_t)((c[1] >> 7) | (c[2] << 2));
+        out[3] = (uint8_t)((c[2] >> 6) | (c[3] << 3));
+        out[4] = (uint8_t)((c[3] >> 5) | (c[4] << 4));
+        out[5] = (uint8_t)((c[4] >> 4) | (c[5] << 5));
+        out[6] = (uint8_t)((c[5] >> 3) | (c[6] << 6));
+        out[7] = (uint8_t)((c[6] >> 2) | (c[7] << 7));
+        out[8] = (uint8_t)((c[7] >> 1));
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack
+*
+* Description: Unpack polynomial t1 with 9-bit coefficients.
+*              Output coefficients are standard representatives.
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int blk;
+
+    /* Nine input bytes yield eight coefficients; each one straddles two
+     * neighbouring bytes and is masked down to 9 bits. */
+    for (blk = 0; blk < N / 8; ++blk) {
+        const unsigned char *in = &a[9 * blk];
+        uint32_t *c = &r->coeffs[8 * blk];
+
+        c[0] = ((in[0] >> 0) | ((uint32_t)in[1] << 8)) & 0x1FF;
+        c[1] = ((in[1] >> 1) | ((uint32_t)in[2] << 7)) & 0x1FF;
+        c[2] = ((in[2] >> 2) | ((uint32_t)in[3] << 6)) & 0x1FF;
+        c[3] = ((in[3] >> 3) | ((uint32_t)in[4] << 5)) & 0x1FF;
+        c[4] = ((in[4] >> 4) | ((uint32_t)in[5] << 4)) & 0x1FF;
+        c[5] = ((in[5] >> 5) | ((uint32_t)in[6] << 3)) & 0x1FF;
+        c[6] = ((in[6] >> 6) | ((uint32_t)in[7] << 2)) & 0x1FF;
+        c[7] = ((in[7] >> 7) | ((uint32_t)in[8] << 1)) & 0x1FF;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyt0_pack
+*
+* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*              Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
+*
+* Arguments:   - unsigned char *r: pointer to output byte array with at least
+*                                  POLT0_SIZE_PACKED bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(unsigned char *restrict r, const poly *restrict a) {
+    unsigned int blk, j;
+    uint32_t t[4];
+
+    /* Four coefficients are packed into seven bytes (14 bits per value). */
+    for (blk = 0; blk < N / 4; ++blk) {
+        const uint32_t *c = &a->coeffs[4 * blk];
+        unsigned char *out = &r[7 * blk];
+
+        /* Map every coefficient to Q + 2^{D-1} - coefficient. */
+        for (j = 0; j < 4; ++j) {
+            t[j] = Q + (1U << (D - 1)) - c[j];
+        }
+
+        out[0] = (unsigned char)(t[0]);
+        out[1] = (unsigned char)((t[0] >> 8) | (t[1] << 6));
+        out[2] = (unsigned char)(t[1] >> 2);
+        out[3] = (unsigned char)((t[1] >> 10) | (t[2] << 4));
+        out[4] = (unsigned char)(t[2] >> 4);
+        out[5] = (unsigned char)((t[2] >> 12) | (t[3] << 2));
+        out[6] = (unsigned char)(t[3] >> 6);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack
+*
+* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+*              Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}].
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const unsigned char *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(poly *restrict r, const unsigned char *restrict a) {
+    unsigned int blk, j;
+    uint32_t t[4];
+
+    /* Seven input bytes carry four 14-bit fields. */
+    for (blk = 0; blk < N / 4; ++blk) {
+        const unsigned char *in = &a[7 * blk];
+        uint32_t *c = &r->coeffs[4 * blk];
+
+        t[0] = in[0]
+               | ((uint32_t)(in[1] & 0x3F) << 8);
+        t[1] = (in[1] >> 6)
+               | ((uint32_t)in[2] << 2)
+               | ((uint32_t)(in[3] & 0x0F) << 10);
+        t[2] = (in[3] >> 4)
+               | ((uint32_t)in[4] << 4)
+               | ((uint32_t)(in[5] & 0x03) << 12);
+        t[3] = (in[5] >> 2)
+               | ((uint32_t)in[6] << 6);
+
+        /* Undo the Q + 2^{D-1} - x mapping applied when packing. */
+        for (j = 0; j < 4; ++j) {
+            c[j] = Q + (1U << (D - 1)) - t[j];
+        }
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyz_pack
+*
+* Description: Bit-pack polynomial z with coefficients
+*              in [-(GAMMA1 - 1), GAMMA1 - 1].
+*              Input coefficients are assumed to be standard representatives.
+* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLZ_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyz_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + uint32_t t[2]; + + for (i = 0; i < N / 2; ++i) { + /* Map to {0,...,2*GAMMA1 - 2} */ + t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; + t[0] += ((int32_t)t[0] >> 31) & Q; + t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; + t[1] += ((int32_t)t[1] >> 31) & Q; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1 - 1]. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const unsigned char *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(poly *restrict r, const unsigned char *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[5 * i + 0]; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; + + r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + + r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; + r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + } + +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyw1_pack +* 
+* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. +* Input coefficients are assumed to be standard representatives. +* +* Arguments: - unsigned char *r: pointer to output byte array with at least +* POLW1_SIZE_PACKED bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyw1_pack(unsigned char *restrict r, const poly *restrict a) { + unsigned int i; + + for (i = 0; i < N / 2; ++i) { + r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); + } +} diff --git a/crypto_sign/dilithium4/avx2/poly.h b/crypto_sign/dilithium4/avx2/poly.h new file mode 100644 index 00000000..52e594a5 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/poly.h @@ -0,0 +1,83 @@ +#ifndef POLY_H +#define POLY_H + +#include +#include + +#include "alignment.h" +#include "params.h" + +typedef union { + uint32_t coeffs[N]; + __m256i coeffs_x8[N / 8]; +} poly; + +void PQCLEAN_DILITHIUM4_AVX2_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM4_AVX2_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM4_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM4_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(poly *a); +void PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM4_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM4_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM4_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM4_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); + +int PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(const poly *a, uint32_t B); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform(poly *a, + const 
uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1(poly *a, + const uint8_t *seed, + uint16_t nonce); +void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t *seed, + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); + +void PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM4_AVX2_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM4_AVX2_polyw1_pack(uint8_t *r, const poly *a); +#endif diff --git a/crypto_sign/dilithium4/avx2/polyvec.c b/crypto_sign/dilithium4/avx2/polyvec.c new file mode 100644 index 00000000..99846f07 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/polyvec.c @@ -0,0 +1,353 @@ +#include + +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + 
+/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. 
+*              Input coefficients are assumed to be less than 22*Q. Output
+*              coefficients are less than 2*L*Q.
+*
+* Arguments:   - poly *w: output polynomial
+*              - const polyvecl *u: pointer to first input vector
+*              - const polyvecl *v: pointer to second input vector
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w,
+        const polyvecl *u,
+        const polyvecl *v) {
+    /* The accumulation routine is handed the coefficient array of the first
+     * polynomial of each input vector. */
+    const uint32_t *lhs = u->vec[0].coeffs;
+    const uint32_t *rhs = v->vec[0].coeffs;
+
+    PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx(w->coeffs, lhs, rhs);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input coefficients to be standard representatives.
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - uint32_t bound: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than bound and 1
+* otherwise.
+**************************************************/
+int PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
+    unsigned int k = 0;
+    int exceeded = 0;
+
+    /* Stop at the first polynomial that fails the check. */
+    while (k < L && !exceeded) {
+        exceeded = (PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(&v->vec[k], bound) != 0);
+        ++k;
+    }
+
+    return exceeded;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to representatives in [0,2*Q[.
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq +* +* Description: For all coefficients of polynomials in vector of length K +* subtract Q if coefficient is bigger than Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_csubq(&v->vec[i]); + } +} + +/************************************************* +* Name: polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* Assumes coefficients of polynomials in second input vector +* to be less than 2*Q. No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{32-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input coefficients to be standard representatives. +* +* Arguments: - const polyveck *v: pointer to vector +* - uint32_t B: norm bound +* +* Returns 0 if norm of all polynomials are strictly smaller than B and 1 +* otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients Q + a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+    poly *high = v1->vec;
+    poly *low = v0->vec;
+    const poly *in = v->vec;
+    unsigned int k;
+
+    /* Split each of the K input polynomials into its two parts. */
+    for (k = 0; k != K; ++k) {
+        PQCLEAN_DILITHIUM4_AVX2_poly_power2round(high + k, low + k, in + k);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients Q + a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+    poly *high = v1->vec;
+    poly *low = v0->vec;
+    const poly *in = v->vec;
+    unsigned int k;
+
+    /* Decompose each of the K input polynomials into its two parts. */
+    for (k = 0; k != K; ++k) {
+        PQCLEAN_DILITHIUM4_AVX2_poly_decompose(high + k, low + k, in + k);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM4_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +/************************************************* +* Name: polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *v: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); + } +} diff --git a/crypto_sign/dilithium4/avx2/polyvec.h b/crypto_sign/dilithium4/avx2/polyvec.h new file mode 100644 index 00000000..f0d1a713 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/polyvec.h @@ -0,0 +1,52 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM4_AVX2_POLYVEC_H + +#include + +#include "params.h" +#include "poly.h" + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + +int PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(polyveck *v); +void 
PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM4_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(polyveck *v); + +int PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(const polyveck *v, uint32_t B); + +void PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +void PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); + +#endif diff --git a/crypto_sign/dilithium4/avx2/reduce.h b/crypto_sign/dilithium4/avx2/reduce.h new file mode 100644 index 00000000..ccb2f18c --- /dev/null +++ b/crypto_sign/dilithium4/avx2/reduce.h @@ -0,0 +1,9 @@ +#ifndef REDUCE_H +#define REDUCE_H + +#include + +void PQCLEAN_DILITHIUM4_AVX2_reduce_avx(uint32_t a[N]); +void PQCLEAN_DILITHIUM4_AVX2_csubq_avx(uint32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium4/avx2/reduce.s b/crypto_sign/dilithium4/avx2/reduce.s new file mode 100644 index 00000000..b4c4a567 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/reduce.s @@ -0,0 +1,91 @@ +.global PQCLEAN_DILITHIUM4_AVX2_reduce_avx +PQCLEAN_DILITHIUM4_AVX2_reduce_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8x23ones(%rip),%ymm0 + +xor %eax,%eax +_looptop_rdc32: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm7 + +#reduce +vpsrld $23,%ymm1,%ymm2 +vpsrld $23,%ymm3,%ymm4 +vpsrld $23,%ymm5,%ymm6 +vpsrld $23,%ymm7,%ymm8 +vpand %ymm0,%ymm1,%ymm1 
+vpand %ymm0,%ymm3,%ymm3 +vpand %ymm0,%ymm5,%ymm5 +vpand %ymm0,%ymm7,%ymm7 +vpsubd %ymm2,%ymm1,%ymm1 +vpsubd %ymm4,%ymm3,%ymm3 +vpsubd %ymm6,%ymm5,%ymm5 +vpsubd %ymm8,%ymm7,%ymm7 +vpslld $13,%ymm2,%ymm2 +vpslld $13,%ymm4,%ymm4 +vpslld $13,%ymm6,%ymm6 +vpslld $13,%ymm8,%ymm8 +vpaddd %ymm2,%ymm1,%ymm1 +vpaddd %ymm4,%ymm3,%ymm3 +vpaddd %ymm6,%ymm5,%ymm5 +vpaddd %ymm8,%ymm7,%ymm7 + +#store +vmovdqa %ymm1,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm7,96(%rdi) + +add $128,%rdi +add $1,%eax +cmp $8,%eax +jb _looptop_rdc32 + +ret + +.global PQCLEAN_DILITHIUM4_AVX2_csubq_avx +PQCLEAN_DILITHIUM4_AVX2_csubq_avx: +#consts +vmovdqa _PQCLEAN_DILITHIUM4_AVX2_8xq(%rip),%ymm0 + +xor %eax,%eax +_looptop_csubq: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm7 + +#PQCLEAN_DILITHIUM4_AVX2_csubq +vpsubd %ymm0,%ymm1,%ymm1 +vpsubd %ymm0,%ymm3,%ymm3 +vpsubd %ymm0,%ymm5,%ymm5 +vpsubd %ymm0,%ymm7,%ymm7 +vpsrad $31,%ymm1,%ymm2 +vpsrad $31,%ymm3,%ymm4 +vpsrad $31,%ymm5,%ymm6 +vpsrad $31,%ymm7,%ymm8 +vpand %ymm0,%ymm2,%ymm2 +vpand %ymm0,%ymm4,%ymm4 +vpand %ymm0,%ymm6,%ymm6 +vpand %ymm0,%ymm8,%ymm8 +vpaddd %ymm2,%ymm1,%ymm1 +vpaddd %ymm4,%ymm3,%ymm3 +vpaddd %ymm6,%ymm5,%ymm5 +vpaddd %ymm8,%ymm7,%ymm7 + +#store +vmovdqa %ymm1,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm7,96(%rdi) + +add $128,%rdi +add $1,%eax +cmp $8,%eax +jb _looptop_csubq + +ret diff --git a/crypto_sign/dilithium4/avx2/rejsample.c b/crypto_sign/dilithium4/avx2/rejsample.c new file mode 100644 index 00000000..89b98cfb --- /dev/null +++ b/crypto_sign/dilithium4/avx2/rejsample.c @@ -0,0 +1,443 @@ +#include +#include + +#include "params.h" +#include "rejsample.h" + +static const uint8_t idx[256][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 1, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 0, 0, 0, 0, 0}, + { 2, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0}, + { 1, 2, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 0, 0, 0, 0, 0}, + { 
3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + 
{ 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, 
+ { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 0, 0, 0, 0}, + { 1, 2, 6, 7, 0, 0, 0, 0}, + { 0, 1, 2, 6, 7, 0, 0, 0}, + { 3, 6, 7, 0, 0, 0, 0, 0}, + { 0, 3, 6, 7, 0, 0, 0, 0}, + { 1, 3, 6, 7, 0, 0, 0, 0}, + { 0, 1, 3, 6, 7, 0, 0, 0}, + { 2, 3, 6, 7, 0, 0, 0, 0}, + { 0, 2, 3, 6, 7, 0, 0, 0}, + { 1, 2, 3, 6, 7, 0, 0, 0}, + { 0, 1, 2, 3, 6, 7, 0, 0}, + { 4, 6, 7, 0, 0, 0, 0, 0}, + { 0, 4, 6, 7, 0, 0, 0, 0}, + { 1, 4, 6, 7, 0, 0, 0, 0}, + { 0, 1, 4, 6, 7, 0, 0, 0}, + { 2, 4, 6, 7, 0, 0, 0, 0}, + { 0, 2, 4, 6, 7, 0, 0, 0}, + { 1, 2, 4, 6, 7, 0, 0, 
0}, + { 0, 1, 2, 4, 6, 7, 0, 0}, + { 3, 4, 6, 7, 0, 0, 0, 0}, + { 0, 3, 4, 6, 7, 0, 0, 0}, + { 1, 3, 4, 6, 7, 0, 0, 0}, + { 0, 1, 3, 4, 6, 7, 0, 0}, + { 2, 3, 4, 6, 7, 0, 0, 0}, + { 0, 2, 3, 4, 6, 7, 0, 0}, + { 1, 2, 3, 4, 6, 7, 0, 0}, + { 0, 1, 2, 3, 4, 6, 7, 0}, + { 5, 6, 7, 0, 0, 0, 0, 0}, + { 0, 5, 6, 7, 0, 0, 0, 0}, + { 1, 5, 6, 7, 0, 0, 0, 0}, + { 0, 1, 5, 6, 7, 0, 0, 0}, + { 2, 5, 6, 7, 0, 0, 0, 0}, + { 0, 2, 5, 6, 7, 0, 0, 0}, + { 1, 2, 5, 6, 7, 0, 0, 0}, + { 0, 1, 2, 5, 6, 7, 0, 0}, + { 3, 5, 6, 7, 0, 0, 0, 0}, + { 0, 3, 5, 6, 7, 0, 0, 0}, + { 1, 3, 5, 6, 7, 0, 0, 0}, + { 0, 1, 3, 5, 6, 7, 0, 0}, + { 2, 3, 5, 6, 7, 0, 0, 0}, + { 0, 2, 3, 5, 6, 7, 0, 0}, + { 1, 2, 3, 5, 6, 7, 0, 0}, + { 0, 1, 2, 3, 5, 6, 7, 0}, + { 4, 5, 6, 7, 0, 0, 0, 0}, + { 0, 4, 5, 6, 7, 0, 0, 0}, + { 1, 4, 5, 6, 7, 0, 0, 0}, + { 0, 1, 4, 5, 6, 7, 0, 0}, + { 2, 4, 5, 6, 7, 0, 0, 0}, + { 0, 2, 4, 5, 6, 7, 0, 0}, + { 1, 2, 4, 5, 6, 7, 0, 0}, + { 0, 1, 2, 4, 5, 6, 7, 0}, + { 3, 4, 5, 6, 7, 0, 0, 0}, + { 0, 3, 4, 5, 6, 7, 0, 0}, + { 1, 3, 4, 5, 6, 7, 0, 0}, + { 0, 1, 3, 4, 5, 6, 7, 0}, + { 2, 3, 4, 5, 6, 7, 0, 0}, + { 0, 2, 3, 4, 5, 6, 7, 0}, + { 1, 2, 3, 4, 5, 6, 7, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7} +}; + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_uniform( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { /* Rejection-samples values < Q from 3-byte groups of buf into r; returns the number of values written (at most len). */ + unsigned int i, ctr, pos; + uint32_t vec[8]; + __m256i d, tmp; + uint32_t good; + const __m256i bound = _mm256_set1_epi32(Q); + + ctr = pos = 0; + while (ctr + 8 <= len && pos + 24 <= buflen) { /* vector path: 8 candidates (24 bytes) per iteration */ + for (i = 0; i < 8; i++) { + vec[i] = buf[pos++]; + vec[i] |= (uint32_t)buf[pos++] << 8; + vec[i] |= (uint32_t)buf[pos++] << 16; + vec[i] &= 0x7FFFFF; /* keep the low 23 bits */ + } + + d = _mm256_loadu_si256((__m256i_u *)vec); + tmp = _mm256_cmpgt_epi32(bound, d); + good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* bit i set iff vec[i] < Q; cast intrinsic replaces the GCC-only vector cast */ + + __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); /* idx[good] lists the accepted lane indices in ascending order */ + tmp = _mm256_cvtepu8_epi32(rid); + d = _mm256_permutevar8x32_epi32(d, tmp); /* move accepted lanes to the front */ + _mm256_storeu_si256((__m256i_u *)&r[ctr],
d); + ctr += __builtin_popcount(good); + } + + while (ctr < len && pos + 3 <= buflen) { + vec[0] = buf[pos++]; + vec[0] |= (uint32_t)buf[pos++] << 8; + vec[0] |= (uint32_t)buf[pos++] << 16; + vec[0] &= 0x7FFFFF; + + if (vec[0] < Q) { + r[ctr++] = vec[0]; + } + } + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_eta( + uint32_t *r, + unsigned int len, + const unsigned char *buf, + unsigned int buflen) { + unsigned int i, ctr, pos; + uint8_t vec[32]; + __m256i tmp0, tmp1; + __m128i d0, d1, rid; + uint32_t good; + const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); + const __m256i off = _mm256_set1_epi32(Q + ETA); + + ctr = pos = 0; + while (ctr + 32 <= len && pos + 16 <= buflen) { + for (i = 0; i < 16; i++) { + vec[2 * i + 0] = buf[pos] & 0x07; + vec[2 * i + 1] = buf[pos++] >> 5; + } + + tmp0 = _mm256_loadu_si256((__m256i_u *)vec); + tmp1 = _mm256_cmpgt_epi8(bound, tmp0); + good = _mm256_movemask_epi8(tmp1); + + d0 = _mm256_castsi256_si128(tmp0); + rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount(good & 0xFF); + + d0 = _mm_bsrli_si128(d0, 8); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 8) & 0xFF); + + d0 = _mm256_extracti128_si256(tmp0, 1); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = _mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 16) & 0xFF); + + d0 = _mm_bsrli_si128(d0, 8); + rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); + d1 = _mm_shuffle_epi8(d0, rid); + tmp1 = 
_mm256_cvtepu8_epi32(d1); + tmp1 = _mm256_sub_epi32(off, tmp1); + _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); + ctr += __builtin_popcount((good >> 24) & 0xFF); + } + + while (ctr < len && pos < buflen) { /* scalar path: two candidates per byte */ + vec[0] = buf[pos] & 0x07; + vec[1] = buf[pos++] >> 5; + + if (vec[0] <= 2 * ETA) { + r[ctr++] = Q + ETA - vec[0]; + } + if (vec[1] <= 2 * ETA && ctr < len) { + r[ctr++] = Q + ETA - vec[1]; + } + } + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { /* Rejection-samples 20-bit values v <= 2*GAMMA1 - 2 (two per 5 bytes of buf) and stores Q + GAMMA1 - 1 - v into r; returns the number of values written (at most len). */ + unsigned int i, ctr, pos; + uint32_t vec[8]; + __m256i d, tmp; + uint32_t good; + const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); + const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); + + ctr = pos = 0; + while (ctr + 8 <= len && pos + 20 <= buflen) { /* vector path: 8 candidates (20 bytes) per iteration */ + for (i = 0; i < 4; i++) { + vec[2 * i + 0] = buf[pos + 0]; + vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; + vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; + vec[2 * i + 0] &= 0xFFFFF; /* low 20 bits of the 5-byte group */ + + vec[2 * i + 1] = buf[pos + 2] >> 4; /* high 20 bits of the 5-byte group */ + vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4; + vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + } + + d = _mm256_loadu_si256((__m256i_u *)vec); + tmp = _mm256_cmpgt_epi32(bound, d); + good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* bit i set iff vec[i] <= 2 * GAMMA1 - 2; cast intrinsic replaces the GCC-only vector cast */ + d = _mm256_sub_epi32(off, d); + + __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); + tmp = _mm256_cvtepu8_epi32(rid); + d = _mm256_permutevar8x32_epi32(d, tmp); /* move accepted lanes to the front */ + _mm256_storeu_si256((__m256i_u *)&r[ctr], d); + ctr += __builtin_popcount(good); + } + + while (ctr < len && pos + 5 <= buflen) { /* scalar path: two candidates (5 bytes) per iteration */ + vec[0] = buf[pos + 0]; + vec[0] |= (uint32_t)buf[pos + 1] << 8; + vec[0] |= (uint32_t)buf[pos + 2] << 16; + vec[0] &= 0xFFFFF; + + vec[1] = buf[pos + 2] >> 4; + vec[1] |= (uint32_t)buf[pos + 3] << 4; + vec[1] |= (uint32_t)buf[pos + 4] << 12; + + pos += 5; + + if (vec[0] <= 2 * GAMMA1 - 2) { + r[ctr++] = Q + GAMMA1 - 1 - vec[0]; + } + if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { +
r[ctr++] = Q + GAMMA1 - 1 - vec[1]; + } + } + + return ctr; +} diff --git a/crypto_sign/dilithium4/avx2/rejsample.h b/crypto_sign/dilithium4/avx2/rejsample.h new file mode 100644 index 00000000..f78f56e3 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/rejsample.h @@ -0,0 +1,26 @@ +#ifndef REJSAMPLE_H +#define REJSAMPLE_H + +#include <stdint.h> + +#include "poly.h" + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_uniform( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_eta( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +unsigned int PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1( + uint32_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen); + +#endif diff --git a/crypto_sign/dilithium4/avx2/rounding.c b/crypto_sign/dilithium4/avx2/rounding.c new file mode 100644 index 00000000..80d28229 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/rounding.c @@ -0,0 +1,115 @@ +#include "rounding.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. +* +* Arguments: - uint32_t a: input element +* - uint32_t *a0: pointer to output element Q + a0 +* +* Returns a1.
+**************************************************/ +uint32_t PQCLEAN_DILITHIUM4_AVX2_power2round(uint32_t a, uint32_t *a0) { + int32_t t; + + /* Centralized remainder mod 2^D */ + t = a & ((1U << D) - 1); + t -= (1U << (D - 1)) + 1; + t += (t >> 31) & (1U << D); /* (t >> 31) is an all-ones mask when t is negative (arithmetic shift assumed), so 2^D is added back */ + t -= (1U << (D - 1)) - 1; + *a0 = Q + t; + a = (a - t) >> D; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - uint32_t a: input element +* - uint32_t *a0: pointer to output element Q + a0 +* +* Returns a1. +**************************************************/ +uint32_t PQCLEAN_DILITHIUM4_AVX2_decompose(uint32_t a, uint32_t *a0) { + int32_t t, u; + + /* Centralized remainder mod ALPHA */ + t = a & 0x7FFFF; + t += (a >> 19) << 9; + t -= ALPHA / 2 + 1; + t += (t >> 31) & ALPHA; + t -= ALPHA / 2 - 1; + a -= t; + + /* Divide by ALPHA (possible to avoid) */ + u = a - 1; + u >>= 31; + a = (a >> 19) + 1; + a -= u & 1; + + /* Border case */ + *a0 = Q + t - (a >> 4); + a &= 0xF; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. Inputs assumed to be +* standard representatives. +* +* Arguments: - uint32_t a0: low bits of input element +* - uint32_t a1: high bits of input element +* +* Returns 0 if a0 <= GAMMA2, a0 > Q - GAMMA2, or a0 == Q - GAMMA2 with a1 == 0; returns 1 otherwise.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM4_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { + if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { + return 0; + } + + return 1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - uint32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. +**************************************************/ +uint32_t PQCLEAN_DILITHIUM4_AVX2_use_hint(const uint32_t a, const unsigned int hint) { + uint32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM4_AVX2_decompose(a, &a0); /* a0 receives Q + (low part) */ + if (hint == 0) { + return a1; + } + if (a0 > Q) { /* positive low part: step the high bits up, wrapping mod 16 */ + return (a1 + 1) & 0xF; + } + return (a1 - 1) & 0xF; + + /* If decompose does not divide out ALPHA: + if(hint == 0) + return a1; + else if(a0 > Q) + return (a1 + ALPHA) % (Q - 1); + else + return (a1 - ALPHA) % (Q - 1); + */ +} diff --git a/crypto_sign/dilithium4/avx2/rounding.h b/crypto_sign/dilithium4/avx2/rounding.h new file mode 100644 index 00000000..611791c9 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/rounding.h @@ -0,0 +1,12 @@ +#ifndef ROUNDING_H +#define ROUNDING_H + +#include "params.h" +#include <stdint.h> + +uint32_t PQCLEAN_DILITHIUM4_AVX2_power2round(uint32_t a, uint32_t *a0); +uint32_t PQCLEAN_DILITHIUM4_AVX2_decompose(uint32_t a, uint32_t *a0); +unsigned int PQCLEAN_DILITHIUM4_AVX2_make_hint(uint32_t a0, uint32_t a1); +uint32_t PQCLEAN_DILITHIUM4_AVX2_use_hint(uint32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium4/avx2/shuffle.inc b/crypto_sign/dilithium4/avx2/shuffle.inc new file mode 100644 index 00000000..df352030 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/shuffle.inc @@ -0,0 +1,23 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq
%ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +vpsllq $32,%ymm\r1,%ymm12 +vpsrlq $32,%ymm\r0,%ymm13 +vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm12 +vpsrld $16,%ymm\r0,%ymm13 +vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 +vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium4/avx2/sign.c b/crypto_sign/dilithium4/avx2/sign.c new file mode 100644 index 00000000..9a64b286 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/sign.c @@ -0,0 +1,463 @@ +#include <stdint.h> +#include <string.h> + +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|i|j).
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_expand_mat(polyvecl mat[6], const uint8_t rho[SEEDBYTES]) { + poly t0, t1; /* throwaway outputs for the two unused lanes of the final 4x call */ + + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[0].vec[0], + &mat[0].vec[1], + &mat[0].vec[2], + &mat[0].vec[3], + rho, 0, 1, 2, 3); /* the nonce passed for entry mat[i].vec[j] is 256 * i + j */ + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[0].vec[4], + &mat[1].vec[0], + &mat[1].vec[1], + &mat[1].vec[2], + rho, 4, 256, 257, 258); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[1].vec[3], + &mat[1].vec[4], + &mat[2].vec[0], + &mat[2].vec[1], + rho, 259, 260, 512, 513); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[2].vec[2], + &mat[2].vec[3], + &mat[2].vec[4], + &mat[3].vec[0], + rho, 514, 515, 516, 768); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[3].vec[1], + &mat[3].vec[2], + &mat[3].vec[3], + &mat[3].vec[4], + rho, 769, 770, 771, 772); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[4].vec[0], + &mat[4].vec[1], + &mat[4].vec[2], + &mat[4].vec[3], + rho, 1024, 1025, 1026, 1027); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[4].vec[4], + &mat[5].vec[0], + &mat[5].vec[1], + &mat[5].vec[2], + rho, 1028, 1280, 1281, 1282); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[5].vec[3], + &mat[5].vec[4], + &t0, + &t1, + rho, 1283, 1284, 0, 0); /* only 30 entries are needed; the last two lanes fill t0/t1, which are never read */ +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_challenge +* +* Description: Implementation of H. Samples polynomial with 60 nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(mu|w1).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing mu +* - const polyveck *w1: pointer to vector w1 +**************************************************/ +void PQCLEAN_DILITHIUM4_AVX2_challenge(poly *c, + const uint8_t mu[CRHBYTES], + const polyveck *w1) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; + shake256ctx state; + + for (i = 0; i < CRHBYTES; ++i) { /* inbuf = mu || packed w1 */ + inbuf[i] = mu[i]; + } + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); + } + + shake256_absorb(&state, inbuf, sizeof(inbuf)); + shake256_squeezeblocks(outbuf, 1, &state); + + signs = 0; + for (i = 0; i < 8; ++i) { /* first 8 output bytes, little-endian, supply the sign bits */ + signs |= (uint64_t) outbuf[i] << 8 * i; + } + + pos = 8; /* bytes 0-7 of the first block were consumed above */ + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + + for (i = 196; i < 256; ++i) { /* 60 iterations; each places one nonzero coefficient */ + do { + if (pos >= SHAKE256_RATE) { /* block exhausted: squeeze another one */ + shake256_squeezeblocks(outbuf, 1, &state); + pos = 0; + } + + b = outbuf[pos++]; + } while (b > i); /* reject candidate positions above i */ + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1; + c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); /* sign bit set: turn the 1 into Q - 1 */ + signs >>= 1; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key.
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; + uint16_t nonce = 0; + polyvecl mat[K]; + polyvecl s1, s1hat; + polyveck s2, t, t1, t0; + + /* Draw 3 * SEEDBYTES random bytes and split them into rho, rhoprime and key */ + randombytes(seedbuf, 3 * SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Expand matrix */ + PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, + nonce, nonce + 1, nonce + 2, nonce + 3); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s2.vec[0], &s2.vec[1], &s2.vec[2], rhoprime, + nonce + 4, nonce + 5, nonce + 6, nonce + 7); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s2.vec[3], &s2.vec[4], &s2.vec[5], &t.vec[0], rhoprime, + nonce + 8, nonce + 9, nonce + 10, 0); /* NOTE(review): fourth lane looks like a dummy; t.vec[0] is presumably overwritten by the loop below - confirm */ + + /* Matrix-vector multiplication */ + s1hat = s1; + PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&s1hat); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); + //PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&t.vec[i]); + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&t.vec[i]); + } + + /* Add error vector s2 */ + PQCLEAN_DILITHIUM4_AVX2_polyveck_add(&t, &t, &s2); + + /* Extract t1 and write public key */ + PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(&t); + PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round(&t1, &t0, &t); + PQCLEAN_DILITHIUM4_AVX2_pack_pk(pk, rho, &t1); + + /*
Compute CRH(rho, t1) and write secret key */ + crh(tr, pk, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES); + PQCLEAN_DILITHIUM4_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature +* +* Description: Compute the signature of a message; only the signature is written, the message is not copied. +* +* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES +* of len) +* - size_t *siglen: pointer to output length of signature +* (set to PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + size_t i; + unsigned int n; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; /* layout: rho | tr | key | mu | rhoprime */ + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + poly c, chat; + polyvecl mat[K], s1, y, yhat, z; + polyveck t0, s2, w, w1, w0; + polyveck h, cs2, ct0; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM4_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + + + // use incremental hash API instead of copying around buffers + /* Compute CRH(tr, m) */ + shake256incctx state; + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y */ +
PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &y.vec[3], + rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); + PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1(&y.vec[4], rhoprime, nonce + 4); + nonce += 5; /* one nonce per sampled polynomial y.vec[0..4]; every retry from rej uses fresh nonces */ + + /* Matrix-vector multiplication */ + yhat = y; + PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&yhat); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); + PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&w.vec[i]); + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&w.vec[i]); + } + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&w); + PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose(&w1, &w0, &w); + PQCLEAN_DILITHIUM4_AVX2_challenge(&c, mu, &w1); + chat = c; + PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&chat); + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&cs2.vec[i]); + } + PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(&w0, &w0, &cs2); + PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(&w0); + if (PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&z.vec[i]); + } + PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(&z); + if (PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); + PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&ct0.vec[i]); + } + + PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&ct0);
+ if (PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM4_AVX2_polyveck_add(&w0, &w0, &ct0); + PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&w0); + n = PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint(&h, &w0, &w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature */ + PQCLEAN_DILITHIUM4_AVX2_pack_sig(sig, &z, &h, &c); + *siglen = PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign +* +* Description: Compute signed message (signature followed by the message). +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, m, mlen); /* place the message after the signature slot; memmove because sm may overlap m */ + rc = PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, mlen, sk); /* sign the relocated copy: when sm overlaps m, the bytes at m may already have been overwritten by the memmove */ + *smlen += mlen; + return rc; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify +* +* Description: Verify signed message.
+* +* Arguments: - const uint8_t *sig: signature +* - size_t siglen: length of signature (at least PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk) { + size_t i; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + poly c, chat, cp; + polyvecl mat[K], z; + polyveck t1, w1, h, tmp1, tmp2; + + if (siglen < PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM4_AVX2_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM4_AVX2_unpack_sig(&z, &h, &c, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES); + + shake256incctx state; + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&z); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); + } + + chat = c; + PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&chat); + PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&t1); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); + } + + PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); + PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(&tmp1); +
PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(&tmp1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&tmp1); + PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint(&w1, &tmp1, &h); + + /* Call random oracle and verify challenge */ + PQCLEAN_DILITHIUM4_AVX2_challenge(&cp, mu, &w1); + for (i = 0; i < N; ++i) { + if (c.coeffs[i] != cp.coeffs[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_open +* +* Description: Verify signed message and, on success, copy out the message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise (then *mlen is (size_t) -1 and smlen bytes of m are zeroed) +**************************************************/ +int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) { /* too short to contain a signature */ + goto badsig; + } + *mlen = smlen - PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES; + + if (PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { /* ascending copy to a lower address, so m == sm is safe */ + m[i] = sm[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + + /* Signature verification failed */ +badsig: + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium4/avx2/sign.h b/crypto_sign/dilithium4/avx2/sign.h new file mode 100644 index 00000000..73968a7f --- /dev/null +++ b/crypto_sign/dilithium4/avx2/sign.h @@ -0,0 +1,15 @@ +#ifndef SIGN_H
+#define SIGN_H + +#include "api.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + +void PQCLEAN_DILITHIUM4_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM4_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], + const polyveck *w1); + + +#endif + diff --git a/crypto_sign/dilithium4/avx2/stream.c b/crypto_sign/dilithium4/avx2/stream.c new file mode 100644 index 00000000..2163bc19 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/stream.c @@ -0,0 +1,26 @@ +#include "stream.h" + +#include <string.h> + +void PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { /* absorbs seed || nonce (2 bytes, little-endian) into a SHAKE128 state */ + + uint8_t buf[SEEDBYTES + 2]; + memcpy(buf, seed, SEEDBYTES); + buf[SEEDBYTES] = (uint8_t)nonce; /* low byte of the nonce */ + buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); /* high byte of the nonce */ + + shake128_absorb(state, buf, SEEDBYTES + 2); +} + + +void PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { /* absorbs seed || nonce (2 bytes, little-endian) into a SHAKE256 state */ + + uint8_t buf[CRHBYTES + 2]; + memcpy(buf, seed, CRHBYTES); + buf[CRHBYTES] = (uint8_t)nonce; /* low byte of the nonce */ + buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); /* high byte of the nonce */ + + shake256_absorb(state, buf, CRHBYTES + 2); +} diff --git a/crypto_sign/dilithium4/avx2/stream.h b/crypto_sign/dilithium4/avx2/stream.h new file mode 100644 index 00000000..87a280e4 --- /dev/null +++ b/crypto_sign/dilithium4/avx2/stream.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_STREAM_H +#define PQCLEAN_DILITHIUM4_AVX2_STREAM_H + +#include <stdint.h> + +#include "fips202.h" +#include "params.h" + +void PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#endif diff --git a/crypto_sign/dilithium4/avx2/symmetric.h b/crypto_sign/dilithium4/avx2/symmetric.h new file mode 100644 index 00000000..8ad8243b --- /dev/null +++ b/crypto_sign/dilithium4/avx2/symmetric.h @@ -0,0
+1,23 @@ +#ifndef PQCLEAN_DILITHIUM4_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM4_AVX2_SYMMETRIC_H + +#include "params.h" +#include "stream.h" + + +#include "fips202.h" + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +typedef shake128ctx stream128_state; +typedef shake256ctx stream256_state; + + +#endif diff --git a/crypto_sign/dilithium4/clean/LICENSE b/crypto_sign/dilithium4/clean/LICENSE index 0299dbff..40541676 100644 --- a/crypto_sign/dilithium4/clean/LICENSE +++ b/crypto_sign/dilithium4/clean/LICENSE @@ -1,2 +1,6 @@ -Public Domain -Authors: Léo Ducas, Eike Kiltz, Tancrède Lepoint, Vadim Lyubashevsky, Gregor Seiler, Peter Schwabe, Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and the random number generator +we are using public-domain code from sources +and by authors listed in comments on top of +the respective files. 
diff --git a/crypto_sign/dilithium4/clean/Makefile b/crypto_sign/dilithium4/clean/Makefile index 265034fe..1f8fcac7 100644 --- a/crypto_sign/dilithium4/clean/Makefile +++ b/crypto_sign/dilithium4/clean/Makefile @@ -2,10 +2,10 @@ LIB=libdilithium4_clean.a -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c symmetric.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o symmetric.o +SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c +OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h + reduce.h rounding.h symmetric.h stream.h CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake index 5e341dcf..99ffb30f 100644 --- a/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake +++ b/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libdilithium4_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj symmetric.obj +OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX all: $(LIBRARY) diff --git a/crypto_sign/dilithium4/clean/api.h b/crypto_sign/dilithium4/clean/api.h index c6dac283..75eca0bd 100644 --- a/crypto_sign/dilithium4/clean/api.h +++ b/crypto_sign/dilithium4/clean/api.h @@ -7,10 +7,22 @@ #define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES 1760U #define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES 3856U #define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES 3366U + #define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_ALGNAME "Dilithium4" -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair(uint8_t *pk, 
uint8_t *sk); +int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair( + uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *msg, size_t len, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -20,13 +32,6 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium4/clean/ntt.c b/crypto_sign/dilithium4/clean/ntt.c index e46b42bd..265907fe 100644 --- a/crypto_sign/dilithium4/clean/ntt.c +++ b/crypto_sign/dilithium4/clean/ntt.c @@ -1,11 +1,12 @@ +#include <stdint.h> + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" -#include <stdint.h> -/* Roots of unity in order needed by forward ntt */ -static const uint32_t zetas[N] = { +/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM4_CLEAN_ntt */ +static const uint32_t PQCLEAN_DILITHIUM4_CLEAN_zetas[N] = { 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, @@ -40,8 +41,8 @@ static const uint32_t zetas[N] = { 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 }; -/* Roots of unity in order needed by inverse ntt */ -static const uint32_t zetas_inv[N] = { +/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM4_CLEAN_ntt */ +static const uint32_t PQCLEAN_DILITHIUM4_CLEAN_zetas_inv[N] = { 6403635, 846154, 6979993, 4442679, 
1362209, 48306, 4460757, 554416, 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, @@ -77,7 +78,7 @@ static const uint32_t zetas_inv[N] = { }; /************************************************* -* Name: ntt +* Name: PQCLEAN_DILITHIUM4_CLEAN_ntt * * Description: Forward NTT, in-place. No modular reduction is performed after * additions or subtractions. Hence output coefficients can be up @@ -86,16 +87,16 @@ static const uint32_t zetas_inv[N] = { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]) { +void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t *p) { unsigned int len, start, j, k; uint32_t zeta, t; k = 1; for (len = 128; len > 0; len >>= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas[k++]; + zeta = PQCLEAN_DILITHIUM4_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + t = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); p[j + len] = p[j] + 2 * Q - t; p[j] = p[j] + t; } @@ -104,7 +105,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]) { } /************************************************* -* Name: invntt_frominvmont +* Name: PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont * * Description: Inverse NTT and multiplication by Montgomery factor 2^32. * In-place. 
No modular reductions after additions or @@ -113,7 +114,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]) { * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t p[N]) { +void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t *p) { unsigned int start, len, j, k; uint32_t t, zeta; const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; @@ -121,17 +122,17 @@ void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t p[N]) { k = 0; for (len = 1; len < N; len <<= 1) { for (start = 0; start < N; start = j + len) { - zeta = zetas_inv[k++]; + zeta = PQCLEAN_DILITHIUM4_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { t = p[j]; p[j] = t + p[j + len]; p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t)zeta * p[j + len]); + p[j + len] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); } } } for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t)f * p[j]); + p[j] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) f * p[j]); } } diff --git a/crypto_sign/dilithium4/clean/ntt.h b/crypto_sign/dilithium4/clean/ntt.h index 0a03fefa..21f4d5b8 100644 --- a/crypto_sign/dilithium4/clean/ntt.h +++ b/crypto_sign/dilithium4/clean/ntt.h @@ -1,8 +1,9 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM4_CLEAN_NTT_H + +#include <stdint.h> #include "params.h" -#include <stdint.h> void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]); void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t p[N]); diff --git a/crypto_sign/dilithium4/clean/packing.c b/crypto_sign/dilithium4/clean/packing.c index 2d10aec6..37b0fca6 100644 --- a/crypto_sign/dilithium4/clean/packing.c +++ b/crypto_sign/dilithium4/clean/packing.c @@ -4,17 +4,18 @@ #include "polyvec.h" 
/************************************************* -* Name: pack_pk +* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_pk * * Description: Bit-pack public key pk = (rho, t1). * -* Arguments: - unsigned char pk[]: output byte array -* - const unsigned char rho[]: byte array containing rho +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], - const polyveck *t1) { +void PQCLEAN_DILITHIUM4_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, + const polyveck *t1) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -28,17 +29,18 @@ void PQCLEAN_DILITHIUM4_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], } /************************************************* -* Name: unpack_pk +* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_pk * * Description: Unpack public key pk = (rho, t1). * -* Arguments: - const unsigned char rho[]: output byte array for rho +* Arguments: - const uint8_t rho[]: output byte array for rho * - const polyveck *t1: pointer to output vector t1 -* - unsigned char pk[]: byte array containing bit-packed pk +* - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], - polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) { +void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk( + uint8_t *rho, + polyveck *t1, + const uint8_t *pk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -52,25 +54,26 @@ void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sk +* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_sk * * Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - unsigned char sk[]: output byte array -* - const unsigned char rho[]: byte array containing rho -* - const unsigned char key[]: byte array containing key -* - const unsigned char tr[]: byte array containing tr +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t key[]: byte array containing key +* - const uint8_t tr[]: byte array containing tr * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 * - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { +void PQCLEAN_DILITHIUM4_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -104,25 +107,26 @@ void PQCLEAN_DILITHIUM4_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], } /************************************************* -* Name: unpack_sk +* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_sk * * Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
* -* Arguments: - const unsigned char rho[]: output byte array for rho -* - const unsigned char key[]: output byte array for key -* - const unsigned char tr[]: output byte array for tr +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t key[]: output byte array for key +* - const uint8_t tr[]: output byte array for tr * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 * - const polyveck *r0: pointer to output vector t0 -* - unsigned char sk[]: byte array containing bit-packed sk +* - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]) { +void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk) { unsigned int i; for (i = 0; i < SEEDBYTES; ++i) { @@ -156,19 +160,20 @@ void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], } /************************************************* -* Name: pack_sig +* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_sig * * Description: Bit-pack signature sig = (z, h, c). 
* -* Arguments: - unsigned char sig[]: output byte array +* Arguments: - uint8_t sig[]: output byte array * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial +* - const poly *c: pointer to PQCLEAN_DILITHIUM4_CLEAN_challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { +void PQCLEAN_DILITHIUM4_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, + const polyveck *h, + const poly *c) { unsigned int i, j, k; uint64_t signs, mask; @@ -182,10 +187,11 @@ void PQCLEAN_DILITHIUM4_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], for (i = 0; i < K; ++i) { for (j = 0; j < N; ++j) { if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (unsigned char) j; + sig[k++] = (uint8_t)j; } } - sig[OMEGA + i] = (unsigned char) k; + + sig[OMEGA + i] = (uint8_t)k; } while (k < OMEGA) { sig[k++] = 0; @@ -199,7 +205,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], sig[i] = 0; for (j = 0; j < 8; ++j) { if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (unsigned char) (1U << j); + sig[i] |= (uint8_t)(1u << j); if (c->coeffs[8 * i + j] == (Q - 1)) { signs |= mask; } @@ -209,27 +215,28 @@ void PQCLEAN_DILITHIUM4_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], } sig += N / 8; for (i = 0; i < 8; ++i) { - sig[i] = (unsigned char) (signs >> 8 * i); + sig[i] = (uint8_t)(signs >> 8u * i); } } /************************************************* -* Name: unpack_sig +* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_sig * * Description: Unpack signature sig = (z, h, c). 
* * Arguments: - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial -* - const unsigned char sig[]: byte array containing +* - poly *c: pointer to output PQCLEAN_DILITHIUM4_CLEAN_challenge polynomial +* - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. **************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig(polyvecl *z, - polyveck *h, - poly *c, - const unsigned char sig[CRYPTO_BYTES]) { +int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig( + polyvecl *z, + polyveck *h, + poly *c, + const uint8_t *sig) { unsigned int i, j, k; uint64_t signs; @@ -266,6 +273,7 @@ int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig(polyvecl *z, return 1; } } + sig += OMEGA + K; /* Decode c */ diff --git a/crypto_sign/dilithium4/clean/packing.h b/crypto_sign/dilithium4/clean/packing.h index 7b6ce97a..63b4856b 100644 --- a/crypto_sign/dilithium4/clean/packing.h +++ b/crypto_sign/dilithium4/clean/packing.h @@ -1,31 +1,36 @@ -#ifndef PACKING_H -#define PACKING_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_PACKING_H +#define PQCLEAN_DILITHIUM4_CLEAN_PACKING_H #include "params.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM4_CLEAN_pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES], - const unsigned char rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM4_CLEAN_pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES], - const unsigned char rho[SEEDBYTES], - const unsigned char key[SEEDBYTES], - const unsigned char tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM4_CLEAN_pack_sig(unsigned char sig[CRYPTO_BYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM4_CLEAN_pack_pk( + uint8_t *pk, + const uint8_t *rho, const polyveck *t1); +void PQCLEAN_DILITHIUM4_CLEAN_pack_sk( + uint8_t *sk, + const uint8_t *rho, + const uint8_t *key, + const 
uint8_t *tr, + const polyvecl *s1, + const polyveck *s2, + const polyveck *t0); +void PQCLEAN_DILITHIUM4_CLEAN_pack_sig( + uint8_t *sig, + const polyvecl *z, const polyveck *h, const poly *c); -void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1, - const unsigned char pk[CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk(unsigned char rho[SEEDBYTES], - unsigned char key[SEEDBYTES], - unsigned char tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const unsigned char sk[CRYPTO_SECRETKEYBYTES]); -int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig(polyvecl *z, polyveck *h, poly *c, - const unsigned char sig[CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk( + uint8_t *rho, polyveck *t1, + const uint8_t *pk); +void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk( + uint8_t *rho, + uint8_t *key, + uint8_t *tr, + polyvecl *s1, + polyveck *s2, + polyveck *t0, + const uint8_t *sk); +int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig( + polyvecl *z, polyveck *h, poly *c, const uint8_t *sig); #endif diff --git a/crypto_sign/dilithium4/clean/params.h b/crypto_sign/dilithium4/clean/params.h index 5cd83242..5cfade78 100644 --- a/crypto_sign/dilithium4/clean/params.h +++ b/crypto_sign/dilithium4/clean/params.h @@ -1,5 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM4_CLEAN_PARAMS_H #define SEEDBYTES 32 @@ -7,13 +7,11 @@ #define N 256 #define Q 8380417 #define QBITS 23 -#define ROOT_OF_UNITY 1753 #define D 14 #define GAMMA1 ((Q - 1)/16) #define GAMMA2 (GAMMA1/2) #define ALPHA (2*GAMMA2) -// DilithiumIV parameters #define K 6 #define L 5 #define ETA 3 @@ -21,6 +19,7 @@ #define BETA 175 #define OMEGA 120 + #define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) #define POLT0_SIZE_PACKED ((N*D)/8) #define POLETA_SIZE_PACKED ((N*SETABITS)/8) diff --git a/crypto_sign/dilithium4/clean/poly.c b/crypto_sign/dilithium4/clean/poly.c index 85006681..632fe7be 100644 --- 
a/crypto_sign/dilithium4/clean/poly.c +++ b/crypto_sign/dilithium4/clean/poly.c @@ -1,10 +1,11 @@ +#include <stdint.h> + #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "rounding.h" #include "symmetric.h" -#include <stdint.h> /************************************************* @@ -16,8 +17,7 @@ * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_reduce32(a->coeffs[i]); } } @@ -31,8 +31,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_csubq(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_csubq(a->coeffs[i]); } } @@ -46,8 +45,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_csubq(poly *a) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_freeze(a->coeffs[i]); } } @@ -61,9 +59,8 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(poly *a) { * - const poly *a: pointer to first summand * - const poly *b: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { +void PQCLEAN_DILITHIUM4_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } @@ -78,11 +75,10 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { * Arguments: 
- poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial to be -* subtraced from first input polynomial +* subtracted from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; } } @@ -96,8 +92,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_shiftl(poly *a) { - unsigned int i; - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] <<= D; } } @@ -139,9 +134,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(poly *a) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { c->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); } @@ -160,12 +153,9 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly * * - const poly *v: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -182,12 +172,9 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a * - const poly *c: pointer to input 
polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a1->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); } - } /************************************************* @@ -204,13 +191,11 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) * Returns number of 1 bits. **************************************************/ unsigned int PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - unsigned int i, s = 0; - - for (i = 0; i < N; ++i) { + unsigned int s = 0; + for (size_t i = 0; i < N; ++i) { h->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); s += h->coeffs[i]; } - return s; } @@ -224,12 +209,9 @@ unsigned int PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint(poly *h, const poly *a0, co * - const poly *h: pointer to input hint polynomial **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - unsigned int i; - - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); } - } /************************************************* @@ -244,15 +226,13 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * * Returns 0 if norm is strictly smaller than B and 1 otherwise. **************************************************/ int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B) { - unsigned int i; int32_t t; - /* It is ok to leak which coefficient violates the bound since the probability for each coefficient is independent of secret data but we must not leak the sign of the centralized representative. 
*/ - for (i = 0; i < N; ++i) { + for (size_t i = 0; i < N; ++i) { /* Absolute value of centralized representative */ - t = (int32_t) ((Q - 1) / 2 - a->coeffs[i]); + t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); t ^= (t >> 31); t = (Q - 1) / 2 - t; @@ -260,7 +240,6 @@ int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B) { return 1; } } - return 0; } @@ -272,7 +251,7 @@ int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B) { * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -280,8 +259,8 @@ int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B) { **************************************************/ static unsigned int rej_uniform(uint32_t *a, unsigned int len, - const unsigned char *buf, - unsigned int buflen) { + const uint8_t *buf, + size_t buflen) { unsigned int ctr, pos; uint32_t t; @@ -308,19 +287,20 @@ static unsigned int rej_uniform(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES)/STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t seed[SEEDBYTES], uint16_t nonce) { - unsigned int i, ctr, off; - unsigned int buflen = POLY_UNIFORM_BUFLEN; - unsigned char buf[POLY_UNIFORM_BUFLEN + 2]; - shake128ctx state; + unsigned int i, ctr; + size_t buflen = POLY_UNIFORM_BUFLEN; + uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; + stream128_state state; + size_t off; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); @@ -347,7 +327,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough @@ -355,7 +335,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(poly *a, **************************************************/ static unsigned int rej_eta(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -384,19 +364,18 @@ static unsigned int rej_eta(uint32_t *a, * output stream from SHAKE256(seed|nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N/2 * (1U << SETABITS)) / (2*ETA + 1)\ - + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) #define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce) { unsigned int ctr; - unsigned char buf[POLY_UNIFORM_ETA_BUFLEN]; - shake128ctx state; + uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; + stream128_state state; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); @@ -418,7 +397,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(poly *a, * * Arguments: - uint32_t *a: pointer to output array (allocated) * - unsigned int len: number of coefficients to be sampled -* - const unsigned char *buf: array of random bytes +* - const uint8_t *buf: array of random bytes * - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. 
Can be smaller than len if not enough @@ -426,7 +405,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(poly *a, **************************************************/ static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len, - const unsigned char *buf, + const uint8_t *buf, unsigned int buflen) { unsigned int ctr, pos; uint32_t t0, t1; @@ -438,7 +417,7 @@ static unsigned int rej_gamma1m1(uint32_t *a, t0 |= (uint32_t)buf[pos + 2] << 16; t0 &= 0xFFFFF; - t1 = buf[pos + 2] >> 4; + t1 = buf[pos + 2] >> 4; t1 |= (uint32_t)buf[pos + 3] << 4; t1 |= (uint32_t)buf[pos + 4] << 12; @@ -463,19 +442,19 @@ static unsigned int rej_gamma1m1(uint32_t *a, * sampling on output stream of SHAKE256(seed|nonce). * * Arguments: - poly *a: pointer to output polynomial -* - const unsigned char seed[]: byte array with seed of length +* - const uint8_t seed[]: byte array with seed of length * CRHBYTES * - uint16_t nonce: 16-bit nonce **************************************************/ #define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) #define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce) { unsigned int i, ctr, off; unsigned int buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - unsigned char buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - shake256ctx state; + uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + stream256_state state; stream256_init(&state, seed, nonce); stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); @@ -500,27 +479,27 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1(poly *a, * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. * Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLETA_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { unsigned int i; - unsigned char t[8]; + uint8_t t[8]; for (i = 0; i < N / 8; ++i) { - t[0] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 0]); - t[1] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 1]); - t[2] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 2]); - t[3] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 3]); - t[4] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 4]); - t[5] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 5]); - t[6] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 6]); - t[7] = (uint8_t) (Q + ETA - a->coeffs[8 * i + 7]); + t[0] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 0]); + t[1] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 1]); + t[2] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 2]); + t[3] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 3]); + t[4] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 4]); + t[5] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 5]); + t[6] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 6]); + t[7] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 7]); - r[3 * i + 0] = (uint8_t) ((t[0] >> 0) | (t[1] << 3) | (t[2] << 6)); - r[3 * i + 1] = (uint8_t) ((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[3 * i + 2] = (uint8_t) ((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); + r[3 * i + 0] = (uint8_t)((t[0] >> 0) | (t[1] << 3) | (t[2] << 6)); + r[3 * i + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); + r[3 * i + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); } } @@ -531,18 +510,18 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(unsigned char *r, const poly *a) { * Output coefficients lie in [Q-ETA,Q+ETA]. 
* * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) { +void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { unsigned int i; for (i = 0; i < N / 8; ++i) { r->coeffs[8 * i + 0] = a[3 * i + 0] & 0x07; r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 0x07; - r->coeffs[8 * i + 2] = (uint32_t) ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 0x07; + r->coeffs[8 * i + 2] = (uint32_t)((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 0x07; r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 0x07; r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 0x07; - r->coeffs[8 * i + 5] = (uint32_t) ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 0x07; + r->coeffs[8 * i + 5] = (uint32_t)((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 0x07; r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 0x07; r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 0x07; @@ -555,58 +534,56 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const unsigned char *a) { r->coeffs[8 * i + 6] = Q + ETA - r->coeffs[8 * i + 6]; r->coeffs[8 * i + 7] = Q + ETA - r->coeffs[8 * i + 7]; } + } /************************************************* -* Name: polyt1_pack +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack * * Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(unsigned char *r, const poly *a) { +void PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { unsigned int i; for (i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t) ((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t) ((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t) ((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t) ((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t) ((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t) ((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t) ((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t) ((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t) ((a->coeffs[8 * i + 7] >> 1)); + r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); + r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); + r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); + r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); + r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); + r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); + r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); + r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); + r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); } } /************************************************* -* Name: 
polyt1_unpack +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack * * Description: Unpack polynomial t1 with 9-bit coefficients. * Output coefficients are standard representatives. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; +void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; + r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; + r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; + r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; + r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; + r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; + r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; + r->coeffs[8 * i + 7] 
= ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; } - } /************************************************* @@ -615,32 +592,30 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. * Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLT0_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { uint32_t t[4]; - for (i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + for (size_t i = 0; i < N / 4; ++i) { + t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; + t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; + t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; + t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; - r[7 * i + 0] = (uint8_t) (t[0]); - r[7 * i + 1] = (uint8_t) (t[0] >> 8); - r[7 * i + 1] |= (uint8_t) (t[1] << 6); - r[7 * i + 2] = (uint8_t) (t[1] >> 2); - r[7 * i + 3] = (uint8_t) (t[1] >> 10); - r[7 * i + 3] |= (uint8_t) (t[2] << 4); - r[7 * i + 4] = (uint8_t) (t[2] >> 4); - r[7 * i + 5] = (uint8_t) (t[2] >> 12); - r[7 * i + 5] |= (uint8_t) (t[3] << 2); - r[7 * i + 6] = (uint8_t) (t[3] >> 6); + r[7 * i + 0] = (uint8_t)(t[0]); + r[7 * i + 1] = (uint8_t)(t[0] >> 8); + r[7 * i + 1] |= (uint8_t)(t[1] << 6); + r[7 * i + 2] = (uint8_t)(t[1] >> 2); + r[7 * i + 3] = (uint8_t)(t[1] >> 10); + r[7 * i + 3] |= (uint8_t)(t[2] << 4); + r[7 * i + 4] = (uint8_t)(t[2] >> 4); + 
r[7 * i + 5] = (uint8_t)(t[2] >> 12); + r[7 * i + 5] |= (uint8_t)(t[3] << 2); + r[7 * i + 6] = (uint8_t)(t[3] >> 6); } - } /************************************************* @@ -650,32 +625,30 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(unsigned char *r, const poly *a) { * Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { - unsigned int i; +void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { - for (i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; + for (size_t i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[7 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; + r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; + r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; + r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; + r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 
1]; r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; } - } /************************************************* @@ -685,29 +658,27 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const unsigned char *a) { * in [-(GAMMA1 - 1), GAMMA1 - 1]. * Input coefficients are assumed to be standard representatives. * -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLZ_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(unsigned char *r, const poly *a) { - unsigned int i; +void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(uint8_t *r, const poly *a) { uint32_t t[2]; - for (i = 0; i < N / 2; ++i) { + for (size_t i = 0; i < N / 2; ++i) { /* Map to {0,...,2*GAMMA1 - 2} */ t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; t[0] += ((int32_t)t[0] >> 31) & Q; t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; t[1] += ((int32_t)t[1] >> 31) & Q; - r[5 * i + 0] = (uint8_t) (t[0]); - r[5 * i + 1] = (uint8_t) (t[0] >> 8); - r[5 * i + 2] = (uint8_t) (t[0] >> 16); - r[5 * i + 2] |= (uint8_t) (t[1] << 4); - r[5 * i + 3] = (uint8_t) (t[1] >> 4); - r[5 * i + 4] = (uint8_t) (t[1] >> 12); + r[5 * i + 0] = (uint8_t)t[0]; + r[5 * i + 1] = (uint8_t)(t[0] >> 8); + r[5 * i + 2] = (uint8_t)(t[0] >> 16); + r[5 * i + 2] |= (uint8_t)(t[1] << 4); + r[5 * i + 3] = (uint8_t)(t[1] >> 4); + r[5 * i + 4] = (uint8_t)(t[1] >> 12); } - } /************************************************* @@ -718,26 +689,23 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(unsigned char *r, const poly *a) { * Output coefficients are standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial -* - const unsigned char *a: byte array with bit-packed polynomial +* - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { +void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { + for (size_t i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; + r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; } - } /************************************************* @@ -746,15 +714,13 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const unsigned char *a) { * Description: Bit-pack polynomial w1 with coefficients in [0, 15]. * Input coefficients are assumed to be standard representatives. 
* -* Arguments: - unsigned char *r: pointer to output byte array with at least +* Arguments: - uint8_t *r: pointer to output byte array with at least * POLW1_SIZE_PACKED bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(unsigned char *r, const poly *a) { - unsigned int i; - - for (i = 0; i < N / 2; ++i) { - r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); +void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + for (size_t i = 0; i < N / 2; ++i) { + r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); } } diff --git a/crypto_sign/dilithium4/clean/poly.h b/crypto_sign/dilithium4/clean/poly.h index 2767e259..ee7e2aa4 100644 --- a/crypto_sign/dilithium4/clean/poly.h +++ b/crypto_sign/dilithium4/clean/poly.h @@ -1,8 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM4_CLEAN_POLY_H + +#include #include "params.h" -#include typedef struct { uint32_t coeffs[N]; @@ -27,27 +28,27 @@ void PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(poly *a, const poly *b, const poly * int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B); void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(poly *a, - const unsigned char seed[SEEDBYTES], + const uint8_t *seed, uint16_t nonce); void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1(poly *a, - const unsigned char seed[CRHBYTES], + const uint8_t seed[CRHBYTES], uint16_t nonce); -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); -void 
PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(unsigned char *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const unsigned char *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const uint8_t *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(unsigned char *r, const poly *a); +void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(uint8_t *r, const poly *a); #endif diff --git a/crypto_sign/dilithium4/clean/polyvec.c b/crypto_sign/dilithium4/clean/polyvec.c index b75203d0..97533d61 100644 --- a/crypto_sign/dilithium4/clean/polyvec.c +++ b/crypto_sign/dilithium4/clean/polyvec.c @@ -1,14 +1,15 @@ +#include + #include "params.h" #include "poly.h" #include "polyvec.h" -#include /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ /************************************************* -* Name: polyvecl_freeze +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze * * Description: Reduce coefficients of polynomials in vector of length L * to standard representatives. 
@@ -24,7 +25,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze(polyvecl *v) { } /************************************************* -* Name: polyvecl_add +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add * * Description: Add vectors of polynomials of length L. * No modular reduction is performed. @@ -42,7 +43,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const } /************************************************* -* Name: polyvecl_ntt +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt * * Description: Forward NTT of all polynomials in vector of length L. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -58,7 +59,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(polyvecl *v) { } /************************************************* -* Name: polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials @@ -85,7 +86,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, } /************************************************* -* Name: polyvecl_chknorm +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. * Assumes input coefficients to be standard representatives. @@ -96,14 +97,15 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery(poly *w, * Returns 0 if norm of all polynomials is strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { +int PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { unsigned int i; for (i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } @@ -113,7 +115,7 @@ int PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t bound) /************************************************* -* Name: polyveck_reduce +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K * to representatives in [0,2*Q[. @@ -129,7 +131,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce(polyveck *v) { } /************************************************* -* Name: polyveck_csubq +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq * * Description: For all coefficients of polynomials in vector of length K * subtract Q if coefficient is bigger than Q. @@ -145,7 +147,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(polyveck *v) { } /************************************************* -* Name: polyveck_freeze +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze * * Description: Reduce coefficients of polynomials in vector of length K * to standard representatives. @@ -161,7 +163,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze(polyveck *v) { } /************************************************* -* Name: polyveck_add +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_add * * Description: Add vectors of polynomials of length K. * No modular reduction is performed. @@ -179,7 +181,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_sub +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub * * Description: Subtract vectors of polynomials of length K. 
* Assumes coefficients of polynomials in second input vector @@ -199,7 +201,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const } /************************************************* -* Name: polyveck_shiftl +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular * reduction. Assumes input coefficients to be less than 2^{32-D}. @@ -215,7 +217,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl(polyveck *v) { } /************************************************* -* Name: polyveck_ntt +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt * * Description: Forward NTT of all polynomials in vector of length K. Output * coefficients can be up to 16*Q larger than input coefficients. @@ -231,7 +233,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(polyveck *v) { } /************************************************* -* Name: polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. Input coefficients need to be less @@ -248,7 +250,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery(polyveck *v) { } /************************************************* -* Name: polyveck_chknorm +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. * Assumes input coefficients to be standard representatives. @@ -259,19 +261,20 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery(polyveck *v) { * Returns 0 if norm of all polynomials are strictly smaller than B and 1 * otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t bound) { +int PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { unsigned int i; for (i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], bound)) { + if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], B)) { return 1; } } + return 0; } /************************************************* -* Name: polyveck_power2round +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, * compute a0, a1 such that a mod Q = a1*2^D + a0 @@ -293,7 +296,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, c } /************************************************* -* Name: polyveck_decompose +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, * compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 @@ -316,7 +319,7 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con } /************************************************* -* Name: polyveck_make_hint +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint * * Description: Compute hint vector. * @@ -339,19 +342,19 @@ unsigned int PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint(polyveck *h, } /************************************************* -* Name: polyveck_use_hint +* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint * * Description: Use hint vector to correct the high bits of input vector. 
* * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *u: pointer to input vector +* - const polyveck *v: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { +void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { unsigned int i; for (i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); } } diff --git a/crypto_sign/dilithium4/clean/polyvec.h b/crypto_sign/dilithium4/clean/polyvec.h index 9481672b..2b9ec4d5 100644 --- a/crypto_sign/dilithium4/clean/polyvec.h +++ b/crypto_sign/dilithium4/clean/polyvec.h @@ -1,9 +1,10 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM4_CLEAN_POLYVEC_H + +#include #include "params.h" #include "poly.h" -#include /* Vectors of polynomials of length L */ typedef struct { @@ -46,6 +47,6 @@ void PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, con unsigned int PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint(polyveck *h, const polyveck *v0, const polyveck *v1); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); +void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h); #endif diff --git a/crypto_sign/dilithium4/clean/reduce.c b/crypto_sign/dilithium4/clean/reduce.c index 8901a453..69334c2d 100644 --- a/crypto_sign/dilithium4/clean/reduce.c +++ b/crypto_sign/dilithium4/clean/reduce.c @@ -1,9 +1,10 @@ -#include "params.h" -#include "reduce.h" #include +#include "params.h" +#include "reduce.h" + /************************************************* -* Name: montgomery_reduce 
+* Name: PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce * * Description: For finite field element a with 0 <= a <= Q*2^32, * compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. @@ -20,11 +21,11 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce(uint64_t a) { t *= Q; t = a + t; t >>= 32; - return (uint32_t) t; + return (uint32_t)t; } /************************************************* -* Name: reduce32 +* Name: PQCLEAN_DILITHIUM4_CLEAN_reduce32 * * Description: For finite field element a, compute r \equiv a (mod Q) * such that 0 <= r < 2*Q. @@ -43,7 +44,7 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_reduce32(uint32_t a) { } /************************************************* -* Name: csubq +* Name: PQCLEAN_DILITHIUM4_CLEAN_csubq * * Description: Subtract Q if input coefficient is bigger than Q. * @@ -58,7 +59,7 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_csubq(uint32_t a) { } /************************************************* -* Name: freeze +* Name: PQCLEAN_DILITHIUM4_CLEAN_freeze * * Description: For finite field element a, compute standard * representative r = a mod Q. diff --git a/crypto_sign/dilithium4/clean/reduce.h b/crypto_sign/dilithium4/clean/reduce.h index 7fe2fd2d..9caf592d 100644 --- a/crypto_sign/dilithium4/clean/reduce.h +++ b/crypto_sign/dilithium4/clean/reduce.h @@ -1,5 +1,5 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM4_CLEAN_REDUCE_H #include diff --git a/crypto_sign/dilithium4/clean/rounding.c b/crypto_sign/dilithium4/clean/rounding.c index 95897924..f0d717b4 100644 --- a/crypto_sign/dilithium4/clean/rounding.c +++ b/crypto_sign/dilithium4/clean/rounding.c @@ -1,7 +1,10 @@ +#include + #include "params.h" #include "rounding.h" + /************************************************* -* Name: power2round +* Name: PQCLEAN_DILITHIUM4_CLEAN_power2round * * Description: For finite field element a, compute a0, a1 such that * a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. 
@@ -17,16 +20,16 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_power2round(uint32_t a, uint32_t *a0) { /* Centralized remainder mod 2^D */ t = a & ((1U << D) - 1); - t -= ((1U << (D - 1)) + 1); - t += ((uint32_t)((int32_t)t >> 31) & (1U << D)); - t -= ((1U << (D - 1)) - 1); - *a0 = (Q + t); + t -= (1U << (D - 1)) + 1; + t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); + t -= (1U << (D - 1)) - 1; + *a0 = Q + t; a = (a - t) >> D; return a; } /************************************************* -* Name: decompose +* Name: PQCLEAN_DILITHIUM4_CLEAN_decompose * * Description: For finite field element a, compute high and low bits a0, a1 such * that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except @@ -41,28 +44,29 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_power2round(uint32_t a, uint32_t *a0) { **************************************************/ uint32_t PQCLEAN_DILITHIUM4_CLEAN_decompose(uint32_t a, uint32_t *a0) { int32_t t, u; + /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (int32_t) ((a >> 19) << 9); + t = a & 0x7FFFFu; + t += (int32_t)((a >> 19u) << 9u); t -= ALPHA / 2 + 1; t += (t >> 31) & ALPHA; t -= ALPHA / 2 - 1; - a -= (uint32_t) t; + a -= (uint32_t)t; /* Divide by ALPHA (possible to avoid) */ - u = (int32_t) a - 1; + u = (int32_t)(a - 1); u >>= 31; a = (a >> 19) + 1; a -= u & 1; /* Border case */ - *a0 = Q + (uint32_t)t - (a >> 4); - a &= 0xF; + *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); + a &= 0xFu; return a; } /************************************************* -* Name: make_hint +* Name: PQCLEAN_DILITHIUM4_CLEAN_make_hint * * Description: Compute hint bit indicating whether the low bits of the * input element overflow into the high bits. Inputs assumed to be @@ -73,7 +77,7 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_decompose(uint32_t a, uint32_t *a0) { * * Returns 1 if high bits of a and b differ and 0 otherwise. 
**************************************************/ -unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(uint32_t a0, uint32_t a1) { +unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(const uint32_t a0, const uint32_t a1) { if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { return 0; } @@ -82,7 +86,7 @@ unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(uint32_t a0, uint32_t a1) { } /************************************************* -* Name: use_hint +* Name: PQCLEAN_DILITHIUM4_CLEAN_use_hint * * Description: Correct high bits according to hint. * @@ -91,7 +95,7 @@ unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(uint32_t a0, uint32_t a1) { * * Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_use_hint(uint32_t a, unsigned int hint) { +uint32_t PQCLEAN_DILITHIUM4_CLEAN_use_hint(const uint32_t a, const unsigned int hint) { uint32_t a0, a1; a1 = PQCLEAN_DILITHIUM4_CLEAN_decompose(a, &a0); @@ -101,5 +105,15 @@ uint32_t PQCLEAN_DILITHIUM4_CLEAN_use_hint(uint32_t a, unsigned int hint) { if (a0 > Q) { return (a1 + 1) & 0xF; } + return (a1 - 1) & 0xF; + + /* If PQCLEAN_DILITHIUM4_CLEAN_decompose does not divide out ALPHA: + if(hint == 0) + return a1; + else if(a0 > Q) + return (a1 + ALPHA) % (Q - 1); + else + return (a1 - ALPHA) % (Q - 1); + */ } diff --git a/crypto_sign/dilithium4/clean/rounding.h b/crypto_sign/dilithium4/clean/rounding.h index 66cfa6f7..e0eed02f 100644 --- a/crypto_sign/dilithium4/clean/rounding.h +++ b/crypto_sign/dilithium4/clean/rounding.h @@ -1,5 +1,5 @@ -#ifndef ROUNDING_H -#define ROUNDING_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM4_CLEAN_ROUNDING_H #include diff --git a/crypto_sign/dilithium4/clean/sign.c b/crypto_sign/dilithium4/clean/sign.c index 70c4679b..5b85560c 100644 --- a/crypto_sign/dilithium4/clean/sign.c +++ b/crypto_sign/dilithium4/clean/sign.c @@ -1,3 +1,6 @@ +#include +#include + #include "fips202.h" #include "packing.h" 
#include "params.h" @@ -7,19 +10,17 @@ #include "sign.h" #include "symmetric.h" -#include - /************************************************* -* Name: expand_mat +* Name: PQCLEAN_DILITHIUM4_CLEAN_expand_mat * * Description: Implementation of ExpandA. Generates matrix A with uniformly * random coefficients a_{i,j} by performing rejection * sampling on the output stream of SHAKE128(rho|i|j). * * Arguments: - polyvecl mat[K]: output matrix -* - const unsigned char rho[]: byte array containing seed rho +* - const uint8_t rho[]: byte array containing seed rho **************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) { +void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { unsigned int i, j; for (i = 0; i < K; ++i) { @@ -30,23 +31,23 @@ void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rh } /************************************************* -* Name: challenge +* Name: PQCLEAN_DILITHIUM4_CLEAN_challenge * * Description: Implementation of H. Samples polynomial with 60 nonzero * coefficients in {-1,1} using the output stream of * SHAKE256(mu|w1). 
* * Arguments: - poly *c: pointer to output polynomial -* - const unsigned char mu[]: byte array containing mu +* - const uint8_t mu[]: byte array containing mu * - const polyveck *w1: pointer to vector w1 **************************************************/ void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, - const unsigned char mu[CRHBYTES], + const uint8_t mu[CRHBYTES], const polyveck *w1) { unsigned int i, b, pos; uint64_t signs; - unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - unsigned char outbuf[SHAKE256_RATE]; + uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; + uint8_t outbuf[SHAKE256_RATE]; shake256ctx state; for (i = 0; i < CRHBYTES; ++i) { @@ -88,22 +89,22 @@ void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, } /************************************************* -* Name: crypto_sign_keypair +* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair * * Description: Generates public and private key. * -* Arguments: - unsigned char *pk: pointer to output public key (allocated -* array of CRYPTO_PUBLICKEYBYTES bytes) -* - unsigned char *sk: pointer to output private key (allocated -* array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { unsigned int i; - unsigned char seedbuf[3 * SEEDBYTES]; - unsigned char tr[CRHBYTES]; - const unsigned char *rho, *rhoprime, *key; + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; uint16_t nonce = 0; polyvecl mat[K]; polyvecl s1, s1hat; @@ -144,19 +145,35 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { PQCLEAN_DILITHIUM4_CLEAN_pack_pk(pk, rho, &t1); /* 
Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, CRYPTO_PUBLICKEYBYTES); + crh(tr, pk, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES); PQCLEAN_DILITHIUM4_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature +* +* Description: Compute signature. +* +* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES +* bytes long) +* - size_t *siglen: pointer to output length of signature +* (set to PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) +* - const uint8_t *msg: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk) { + const uint8_t *msg, size_t mlen, + const uint8_t *sk) { unsigned long long i; unsigned int n; - unsigned char seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; - unsigned char *rho, *tr, *key, *mu, *rhoprime; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; uint16_t nonce = 0; poly c, chat; polyvecl mat[K], s1, y, yhat, z; @@ -170,13 +187,12 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( rhoprime = mu + CRHBYTES; PQCLEAN_DILITHIUM4_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); - // use incremental hash API instead of copying around buffers /* Compute CRH(tr, msg) */ shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); + shake256_inc_absorb(&state, msg, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); @@ -253,11 +269,51 @@ rej: /* Write signature */ PQCLEAN_DILITHIUM4_CLEAN_pack_sig(sig, &z, &h, &c); - - *siglen = CRYPTO_BYTES; + *siglen = 
PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES; return 0; } +/************************************************* +* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m; the copy stored in sm is what gets signed +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns the result of PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature (0 on success) +**************************************************/ +int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk) { + int rc; + memmove(sm + PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, m, mlen); + rc = PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return rc; +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify +* +* Description: Verify signed message. 
+* +* Arguments: - const uint8_t *sig: pointer to signature +* - size_t siglen: length of signature (PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { @@ -268,7 +324,7 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( polyvecl mat[K], z; polyveck t1, w1, h, tmp1, tmp2; - if (siglen < CRYPTO_BYTES) { + if (siglen < PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) { return -1; } @@ -281,7 +337,7 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( } /* Compute CRH(CRH(rho, t1), msg) */ - crh(mu, pk, CRYPTO_PUBLICKEYBYTES); + crh(mu, pk, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES); shake256incctx state; shake256_inc_init(&state); @@ -325,40 +381,9 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( // All good return 0; } -/************************************************* -* Name: crypto_sign -* -* Description: Compute signed message. 
-* -* Arguments: - unsigned char *sm: pointer to output signed message (allocated -* array with CRYPTO_BYTES + mlen bytes), -* can be equal to m -* - unsigned long long *smlen: pointer to output length of signed -* message -* - const unsigned char *m: pointer to message to be signed -* - unsigned long long mlen: length of message -* - const unsigned char *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign(uint8_t *sm, - size_t *smlen, - const uint8_t *m, - size_t mlen, - const uint8_t *sk) { - size_t i; - int rc; - for (i = 0; i < mlen; i++) { - sm[CRYPTO_BYTES + i] = m[i]; - } - rc = PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; - -} /************************************************* -* Name: crypto_sign_open +* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open * * Description: Verify signed message. * @@ -371,24 +396,23 @@ int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign(uint8_t *sm, * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open(uint8_t *m, - size_t *mlen, - const uint8_t *sm, - size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk) { size_t i; - if (smlen < CRYPTO_BYTES) { + if (smlen < PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - CRYPTO_BYTES; + *mlen = smlen - PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify(sm, CRYPTO_BYTES, - sm + CRYPTO_BYTES, *mlen, pk)) { + if (PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, + sm + PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ for (i = 0; i < *mlen; ++i) { - m[i] 
= sm[CRYPTO_BYTES + i]; + m[i] = sm[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES + i]; } return 0; } diff --git a/crypto_sign/dilithium4/clean/sign.h b/crypto_sign/dilithium4/clean/sign.h index 2ade7be9..f44cb5fd 100644 --- a/crypto_sign/dilithium4/clean/sign.h +++ b/crypto_sign/dilithium4/clean/sign.h @@ -1,30 +1,12 @@ -#ifndef SIGN_H -#define SIGN_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM4_CLEAN_SIGN_H +#include "api.h" #include "params.h" #include "poly.h" #include "polyvec.h" -void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, const unsigned char mu[CRHBYTES], +void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], const polyveck *w1); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); - #endif diff --git a/crypto_sign/dilithium4/clean/stream.c b/crypto_sign/dilithium4/clean/stream.c new file mode 100644 index 00000000..9be23a56 --- /dev/null +++ b/crypto_sign/dilithium4/clean/stream.c @@ -0,0 +1,26 @@ +#include "stream.h" + +#include + +void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + + uint8_t buf[SEEDBYTES + 2]; + memcpy(buf, seed, SEEDBYTES); + buf[SEEDBYTES] = (uint8_t)nonce; + buf[SEEDBYTES + 1] = 
(uint8_t)(nonce >> 8); + + shake128_absorb(state, buf, SEEDBYTES + 2); +} + + +void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + + uint8_t buf[CRHBYTES + 2]; + memcpy(buf, seed, CRHBYTES); + buf[CRHBYTES] = (uint8_t)nonce; + buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); + + shake256_absorb(state, buf, CRHBYTES + 2); +} diff --git a/crypto_sign/dilithium4/clean/stream.h b/crypto_sign/dilithium4/clean/stream.h new file mode 100644 index 00000000..d9807822 --- /dev/null +++ b/crypto_sign/dilithium4/clean/stream.h @@ -0,0 +1,15 @@ +#ifndef PQCLEAN_DILITHIUM4_CLEAN_STREAM_H +#define PQCLEAN_DILITHIUM4_CLEAN_STREAM_H + +#include + +#include "fips202.h" +#include "params.h" + +void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init( + shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init( + shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); + +#endif diff --git a/crypto_sign/dilithium4/clean/symmetric.c b/crypto_sign/dilithium4/clean/symmetric.c deleted file mode 100644 index f7c767d0..00000000 --- a/crypto_sign/dilithium4/clean/symmetric.c +++ /dev/null @@ -1,32 +0,0 @@ -#include "symmetric.h" -#include "fips202.h" - -void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char seed[SEEDBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[SEEDBYTES + 2]; - - for (i = 0; i < SEEDBYTES; ++i) { - buf[i] = seed[i]; - } - buf[SEEDBYTES] = (uint8_t) nonce; - buf[SEEDBYTES + 1] = (uint8_t) (nonce >> 8); - - shake128_absorb(state, buf, sizeof(buf)); -} - -void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char seed[CRHBYTES], - uint16_t nonce) { - unsigned int i; - unsigned char buf[CRHBYTES + 2]; - - for (i = 0; i < CRHBYTES; ++i) { - buf[i] = seed[i]; - } - buf[CRHBYTES] = (uint8_t) nonce; - buf[CRHBYTES + 1] = (uint8_t) (nonce >> 8); - - 
shake256_absorb(state, buf, sizeof(buf)); -} diff --git a/crypto_sign/dilithium4/clean/symmetric.h b/crypto_sign/dilithium4/clean/symmetric.h index 01afe773..4102ddce 100644 --- a/crypto_sign/dilithium4/clean/symmetric.h +++ b/crypto_sign/dilithium4/clean/symmetric.h @@ -1,8 +1,11 @@ -#ifndef SYMMETRIC_H -#define SYMMETRIC_H +#ifndef PQCLEAN_DILITHIUM4_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM4_CLEAN_SYMMETRIC_H + +#include "params.h" +#include "stream.h" + #include "fips202.h" -#include "params.h" #define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) #define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init(STATE, SEED, NONCE) @@ -13,11 +16,8 @@ #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init(shake128ctx *state, - const unsigned char *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init(shake256ctx *state, - const unsigned char *seed, - uint16_t nonce); +typedef shake128ctx stream128_state; +typedef shake256ctx stream256_state; + #endif diff --git a/test/duplicate_consistency/dilithium2_avx2.yml b/test/duplicate_consistency/dilithium2_avx2.yml new file mode 100644 index 00000000..ed477e5e --- /dev/null +++ b/test/duplicate_consistency/dilithium2_avx2.yml @@ -0,0 +1,63 @@ +consistency_checks: +- source: + scheme: dilithium3 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium4 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - 
rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium2 + implementation: clean + files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h + - symmetric.h diff --git a/test/duplicate_consistency/dilithium2_clean.yml b/test/duplicate_consistency/dilithium2_clean.yml index 0b88ceb7..6d90ddca 100644 --- a/test/duplicate_consistency/dilithium2_clean.yml +++ b/test/duplicate_consistency/dilithium2_clean.yml @@ -17,7 +17,8 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h - symmetric.h - source: scheme: dilithium4 @@ -36,5 +37,18 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium2 + implementation: avx2 + files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h - symmetric.h diff --git a/test/duplicate_consistency/dilithium3_avx2.yml b/test/duplicate_consistency/dilithium3_avx2.yml new file mode 100644 index 00000000..3a638dae --- /dev/null +++ b/test/duplicate_consistency/dilithium3_avx2.yml @@ -0,0 +1,63 @@ +consistency_checks: +- source: + scheme: dilithium2 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium4 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium3 + implementation: clean + 
files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h + - symmetric.h diff --git a/test/duplicate_consistency/dilithium3_clean.yml b/test/duplicate_consistency/dilithium3_clean.yml index c7b391c2..1145e238 100644 --- a/test/duplicate_consistency/dilithium3_clean.yml +++ b/test/duplicate_consistency/dilithium3_clean.yml @@ -17,7 +17,8 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h - symmetric.h - source: scheme: dilithium4 @@ -36,5 +37,18 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium2 + implementation: avx2 + files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h - symmetric.h diff --git a/test/duplicate_consistency/dilithium4_avx2.yml b/test/duplicate_consistency/dilithium4_avx2.yml new file mode 100644 index 00000000..b37a7695 --- /dev/null +++ b/test/duplicate_consistency/dilithium4_avx2.yml @@ -0,0 +1,63 @@ +consistency_checks: +- source: + scheme: dilithium2 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium3 + implementation: avx2 + files: + - alignment.h + - fips202x4.c + - fips202x4.h + - nttconsts.c + - nttconsts.h + - ntt.s + - ntt.h + - packing.c + - packing.h + - poly.h + - polyvec.c + - polyvec.h + - reduce.h + - reduce.s + - rounding.c + - rounding.h + - rejsample.h + - sign.h + - stream.c + - stream.h + - symmetric.h +- source: + scheme: dilithium4 + implementation: clean + files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h + - symmetric.h diff --git 
a/test/duplicate_consistency/dilithium4_clean.yml b/test/duplicate_consistency/dilithium4_clean.yml index ad2f29b2..afa91d6b 100644 --- a/test/duplicate_consistency/dilithium4_clean.yml +++ b/test/duplicate_consistency/dilithium4_clean.yml @@ -16,7 +16,8 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h - symmetric.h - source: scheme: dilithium3 @@ -35,5 +36,19 @@ consistency_checks: - rounding.h - sign.c - sign.h - - symmetric.c + - stream.c + - stream.h - symmetric.h +- source: + scheme: dilithium2 + implementation: avx2 + files: + - api.h + - packing.c + - packing.h + - polyvec.h + - params.h + - stream.c + - stream.h + - symmetric.h +