From 32c613e8ecce4e1f1b71950f0d2499366437b126 Mon Sep 17 00:00:00 2001 From: John Schanck Date: Mon, 1 Feb 2021 00:32:40 -0500 Subject: [PATCH] Round 3 update for Dilithium (from github source) (#369) * Update Dilithium * Alternative montgomery reduce to avoid i386 functest errors * Explicit casts for msvc * More casts; bump upstream version; fix metadata * another cast --- .github/workflows/BADGES.md | 145 +- .github/workflows/sign_dilithium2aes.yml | 204 +++ .github/workflows/sign_dilithium3aes.yml | 204 +++ ...ign_dilithium4.yml => sign_dilithium5.yml} | 22 +- .github/workflows/sign_dilithium5aes.yml | 204 +++ BADGES.md | 0 crypto_sign/dilithium2/META.yml | 32 +- crypto_sign/dilithium2/avx2/LICENSE | 7 +- crypto_sign/dilithium2/avx2/Makefile | 31 +- crypto_sign/dilithium2/avx2/align.h | 19 + crypto_sign/dilithium2/avx2/alignment.h | 22 - crypto_sign/dilithium2/avx2/api.h | 27 +- .../cdecl.inc => dilithium2/avx2/cdecl.h} | 18 +- crypto_sign/dilithium2/avx2/consts.c | 101 ++ crypto_sign/dilithium2/avx2/consts.h | 10 + crypto_sign/dilithium2/avx2/f1600x4.S | 909 +++++++++++++ crypto_sign/dilithium2/avx2/fips202x4.c | 346 +++-- crypto_sign/dilithium2/avx2/fips202x4.h | 100 +- crypto_sign/dilithium2/avx2/invntt.S | 456 +++---- crypto_sign/dilithium2/avx2/ntt.S | 304 +++-- crypto_sign/dilithium2/avx2/ntt.h | 38 +- crypto_sign/dilithium2/avx2/nttconsts.c | 80 -- crypto_sign/dilithium2/avx2/nttconsts.h | 27 - crypto_sign/dilithium2/avx2/packing.c | 252 ++-- crypto_sign/dilithium2/avx2/packing.h | 57 +- crypto_sign/dilithium2/avx2/params.h | 40 +- crypto_sign/dilithium2/avx2/pointwise.S | 151 ++- crypto_sign/dilithium2/avx2/poly.c | 1200 +++++++++-------- crypto_sign/dilithium2/avx2/poly.h | 74 +- crypto_sign/dilithium2/avx2/polyvec.c | 283 +++- crypto_sign/dilithium2/avx2/polyvec.h | 66 +- crypto_sign/dilithium2/avx2/reduce.S | 93 -- crypto_sign/dilithium2/avx2/reduce.h | 9 - crypto_sign/dilithium2/avx2/rejsample.c | 265 ++-- crypto_sign/dilithium2/avx2/rejsample.h | 32 
+- crypto_sign/dilithium2/avx2/rounding.c | 196 +-- crypto_sign/dilithium2/avx2/rounding.h | 14 +- crypto_sign/dilithium2/avx2/shuffle.S | 54 + crypto_sign/dilithium2/avx2/shuffle.inc | 18 +- crypto_sign/dilithium2/avx2/sign.c | 508 ++++--- crypto_sign/dilithium2/avx2/sign.h | 30 +- crypto_sign/dilithium2/avx2/stream.c | 26 - crypto_sign/dilithium2/avx2/stream.h | 15 - crypto_sign/dilithium2/avx2/symmetric-shake.c | 26 + crypto_sign/dilithium2/avx2/symmetric.h | 39 +- crypto_sign/dilithium2/clean/LICENSE | 7 +- crypto_sign/dilithium2/clean/Makefile | 9 +- .../dilithium2/clean/Makefile.Microsoft_nmake | 11 +- crypto_sign/dilithium2/clean/api.h | 27 +- crypto_sign/dilithium2/clean/ntt.c | 164 +-- crypto_sign/dilithium2/clean/ntt.h | 7 +- crypto_sign/dilithium2/clean/packing.c | 252 ++-- crypto_sign/dilithium2/clean/packing.h | 57 +- crypto_sign/dilithium2/clean/params.h | 40 +- crypto_sign/dilithium2/clean/poly.c | 721 ++++++---- crypto_sign/dilithium2/clean/poly.h | 57 +- crypto_sign/dilithium2/clean/polyvec.c | 268 ++-- crypto_sign/dilithium2/clean/polyvec.h | 58 +- crypto_sign/dilithium2/clean/reduce.c | 56 +- crypto_sign/dilithium2/clean/reduce.h | 18 +- crypto_sign/dilithium2/clean/rounding.c | 105 +- crypto_sign/dilithium2/clean/rounding.h | 13 +- crypto_sign/dilithium2/clean/sign.c | 366 ++--- crypto_sign/dilithium2/clean/sign.h | 27 +- crypto_sign/dilithium2/clean/stream.c | 26 - crypto_sign/dilithium2/clean/stream.h | 15 - .../dilithium2/clean/symmetric-shake.c | 26 + crypto_sign/dilithium2/clean/symmetric.h | 39 +- crypto_sign/dilithium2aes/META.yml | 31 + crypto_sign/dilithium2aes/avx2/LICENSE | 5 + crypto_sign/dilithium2aes/avx2/Makefile | 23 + crypto_sign/dilithium2aes/avx2/aes256ctr.c | 142 ++ crypto_sign/dilithium2aes/avx2/aes256ctr.h | 29 + crypto_sign/dilithium2aes/avx2/align.h | 19 + crypto_sign/dilithium2aes/avx2/api.h | 31 + crypto_sign/dilithium2aes/avx2/cdecl.h | 24 + crypto_sign/dilithium2aes/avx2/consts.c | 101 ++ 
crypto_sign/dilithium2aes/avx2/consts.h | 10 + crypto_sign/dilithium2aes/avx2/invntt.S | 240 ++++ crypto_sign/dilithium2aes/avx2/ntt.S | 199 +++ crypto_sign/dilithium2aes/avx2/ntt.h | 14 + crypto_sign/dilithium2aes/avx2/packing.c | 261 ++++ crypto_sign/dilithium2aes/avx2/packing.h | 31 + crypto_sign/dilithium2aes/avx2/params.h | 41 + crypto_sign/dilithium2aes/avx2/pointwise.S | 199 +++ crypto_sign/dilithium2aes/avx2/poly.c | 891 ++++++++++++ crypto_sign/dilithium2aes/avx2/poly.h | 52 + crypto_sign/dilithium2aes/avx2/polyvec.c | 449 ++++++ crypto_sign/dilithium2aes/avx2/polyvec.h | 64 + crypto_sign/dilithium2aes/avx2/rejsample.c | 394 ++++++ crypto_sign/dilithium2aes/avx2/rejsample.h | 19 + crypto_sign/dilithium2aes/avx2/rounding.c | 157 +++ crypto_sign/dilithium2aes/avx2/rounding.h | 12 + crypto_sign/dilithium2aes/avx2/shuffle.S | 54 + crypto_sign/dilithium2aes/avx2/shuffle.inc | 25 + crypto_sign/dilithium2aes/avx2/sign.c | 425 ++++++ crypto_sign/dilithium2aes/avx2/sign.h | 29 + crypto_sign/dilithium2aes/avx2/symmetric.h | 25 + crypto_sign/dilithium2aes/clean/LICENSE | 5 + crypto_sign/dilithium2aes/clean/Makefile | 19 + .../clean/Makefile.Microsoft_nmake | 23 + crypto_sign/dilithium2aes/clean/aes256ctr.c | 564 ++++++++ crypto_sign/dilithium2aes/clean/aes256ctr.h | 28 + crypto_sign/dilithium2aes/clean/api.h | 31 + crypto_sign/dilithium2aes/clean/ntt.c | 98 ++ crypto_sign/dilithium2aes/clean/ntt.h | 10 + crypto_sign/dilithium2aes/clean/packing.c | 261 ++++ crypto_sign/dilithium2aes/clean/packing.h | 31 + crypto_sign/dilithium2aes/clean/params.h | 41 + crypto_sign/dilithium2aes/clean/poly.c | 867 ++++++++++++ crypto_sign/dilithium2aes/clean/poly.h | 53 + crypto_sign/dilithium2aes/clean/polyvec.c | 448 ++++++ crypto_sign/dilithium2aes/clean/polyvec.h | 68 + crypto_sign/dilithium2aes/clean/reduce.c | 69 + crypto_sign/dilithium2aes/clean/reduce.h | 17 + crypto_sign/dilithium2aes/clean/rounding.c | 98 ++ crypto_sign/dilithium2aes/clean/rounding.h | 14 + 
crypto_sign/dilithium2aes/clean/sign.c | 343 +++++ crypto_sign/dilithium2aes/clean/sign.h | 29 + .../dilithium2aes/clean/symmetric-aes.c | 12 + crypto_sign/dilithium2aes/clean/symmetric.h | 33 + crypto_sign/dilithium3/META.yml | 32 +- crypto_sign/dilithium3/avx2/LICENSE | 7 +- crypto_sign/dilithium3/avx2/Makefile | 31 +- crypto_sign/dilithium3/avx2/align.h | 19 + crypto_sign/dilithium3/avx2/alignment.h | 22 - crypto_sign/dilithium3/avx2/api.h | 26 +- .../cdecl.inc => dilithium3/avx2/cdecl.h} | 18 +- crypto_sign/dilithium3/avx2/consts.c | 101 ++ crypto_sign/dilithium3/avx2/consts.h | 10 + crypto_sign/dilithium3/avx2/f1600x4.S | 909 +++++++++++++ crypto_sign/dilithium3/avx2/fips202x4.c | 346 +++-- crypto_sign/dilithium3/avx2/fips202x4.h | 100 +- crypto_sign/dilithium3/avx2/invntt.S | 456 +++---- crypto_sign/dilithium3/avx2/ntt.S | 304 +++-- crypto_sign/dilithium3/avx2/ntt.h | 38 +- crypto_sign/dilithium3/avx2/nttconsts.c | 80 -- crypto_sign/dilithium3/avx2/nttconsts.h | 27 - crypto_sign/dilithium3/avx2/packing.c | 252 ++-- crypto_sign/dilithium3/avx2/packing.h | 57 +- crypto_sign/dilithium3/avx2/params.h | 44 +- crypto_sign/dilithium3/avx2/pointwise.S | 151 ++- crypto_sign/dilithium3/avx2/poly.c | 1153 ++++++++-------- crypto_sign/dilithium3/avx2/poly.h | 74 +- crypto_sign/dilithium3/avx2/polyvec.c | 307 ++++- crypto_sign/dilithium3/avx2/polyvec.h | 66 +- crypto_sign/dilithium3/avx2/reduce.S | 93 -- crypto_sign/dilithium3/avx2/reduce.h | 9 - crypto_sign/dilithium3/avx2/rejsample.c | 249 ++-- crypto_sign/dilithium3/avx2/rejsample.h | 32 +- crypto_sign/dilithium3/avx2/rounding.c | 209 +-- crypto_sign/dilithium3/avx2/rounding.h | 14 +- crypto_sign/dilithium3/avx2/shuffle.S | 54 + crypto_sign/dilithium3/avx2/shuffle.inc | 18 +- crypto_sign/dilithium3/avx2/sign.c | 529 ++++---- crypto_sign/dilithium3/avx2/sign.h | 30 +- crypto_sign/dilithium3/avx2/stream.c | 26 - crypto_sign/dilithium3/avx2/stream.h | 15 - crypto_sign/dilithium3/avx2/symmetric-shake.c | 26 + 
crypto_sign/dilithium3/avx2/symmetric.h | 39 +- crypto_sign/dilithium3/clean/LICENSE | 7 +- crypto_sign/dilithium3/clean/Makefile | 9 +- .../dilithium3/clean/Makefile.Microsoft_nmake | 11 +- crypto_sign/dilithium3/clean/api.h | 26 +- crypto_sign/dilithium3/clean/ntt.c | 164 +-- crypto_sign/dilithium3/clean/ntt.h | 7 +- crypto_sign/dilithium3/clean/packing.c | 252 ++-- crypto_sign/dilithium3/clean/packing.h | 57 +- crypto_sign/dilithium3/clean/params.h | 44 +- crypto_sign/dilithium3/clean/poly.c | 664 +++++---- crypto_sign/dilithium3/clean/poly.h | 57 +- crypto_sign/dilithium3/clean/polyvec.c | 268 ++-- crypto_sign/dilithium3/clean/polyvec.h | 58 +- crypto_sign/dilithium3/clean/reduce.c | 56 +- crypto_sign/dilithium3/clean/reduce.h | 18 +- crypto_sign/dilithium3/clean/rounding.c | 99 +- crypto_sign/dilithium3/clean/rounding.h | 13 +- crypto_sign/dilithium3/clean/sign.c | 366 ++--- crypto_sign/dilithium3/clean/sign.h | 27 +- crypto_sign/dilithium3/clean/stream.c | 26 - crypto_sign/dilithium3/clean/stream.h | 15 - .../dilithium3/clean/symmetric-shake.c | 26 + crypto_sign/dilithium3/clean/symmetric.h | 39 +- crypto_sign/dilithium3aes/META.yml | 31 + crypto_sign/dilithium3aes/avx2/LICENSE | 5 + crypto_sign/dilithium3aes/avx2/Makefile | 23 + crypto_sign/dilithium3aes/avx2/aes256ctr.c | 142 ++ crypto_sign/dilithium3aes/avx2/aes256ctr.h | 29 + crypto_sign/dilithium3aes/avx2/align.h | 19 + crypto_sign/dilithium3aes/avx2/api.h | 32 + crypto_sign/dilithium3aes/avx2/cdecl.h | 24 + crypto_sign/dilithium3aes/avx2/consts.c | 101 ++ crypto_sign/dilithium3aes/avx2/consts.h | 10 + crypto_sign/dilithium3aes/avx2/invntt.S | 240 ++++ crypto_sign/dilithium3aes/avx2/ntt.S | 199 +++ crypto_sign/dilithium3aes/avx2/ntt.h | 14 + crypto_sign/dilithium3aes/avx2/packing.c | 261 ++++ crypto_sign/dilithium3aes/avx2/packing.h | 31 + crypto_sign/dilithium3aes/avx2/params.h | 41 + crypto_sign/dilithium3aes/avx2/pointwise.S | 201 +++ crypto_sign/dilithium3aes/avx2/poly.c | 862 ++++++++++++ 
crypto_sign/dilithium3aes/avx2/poly.h | 52 + crypto_sign/dilithium3aes/avx2/polyvec.c | 449 ++++++ crypto_sign/dilithium3aes/avx2/polyvec.h | 64 + crypto_sign/dilithium3aes/avx2/rejsample.c | 378 ++++++ crypto_sign/dilithium3aes/avx2/rejsample.h | 19 + crypto_sign/dilithium3aes/avx2/rounding.c | 154 +++ crypto_sign/dilithium3aes/avx2/rounding.h | 12 + crypto_sign/dilithium3aes/avx2/shuffle.S | 54 + crypto_sign/dilithium3aes/avx2/shuffle.inc | 25 + crypto_sign/dilithium3aes/avx2/sign.c | 425 ++++++ crypto_sign/dilithium3aes/avx2/sign.h | 29 + crypto_sign/dilithium3aes/avx2/symmetric.h | 25 + crypto_sign/dilithium3aes/clean/LICENSE | 5 + crypto_sign/dilithium3aes/clean/Makefile | 19 + .../clean/Makefile.Microsoft_nmake | 23 + crypto_sign/dilithium3aes/clean/aes256ctr.c | 564 ++++++++ crypto_sign/dilithium3aes/clean/aes256ctr.h | 28 + crypto_sign/dilithium3aes/clean/api.h | 32 + crypto_sign/dilithium3aes/clean/ntt.c | 98 ++ crypto_sign/dilithium3aes/clean/ntt.h | 10 + crypto_sign/dilithium3aes/clean/packing.c | 261 ++++ crypto_sign/dilithium3aes/clean/packing.h | 31 + crypto_sign/dilithium3aes/clean/params.h | 41 + crypto_sign/dilithium3aes/clean/poly.c | 818 +++++++++++ crypto_sign/dilithium3aes/clean/poly.h | 53 + crypto_sign/dilithium3aes/clean/polyvec.c | 448 ++++++ crypto_sign/dilithium3aes/clean/polyvec.h | 68 + crypto_sign/dilithium3aes/clean/reduce.c | 69 + crypto_sign/dilithium3aes/clean/reduce.h | 17 + crypto_sign/dilithium3aes/clean/rounding.c | 92 ++ crypto_sign/dilithium3aes/clean/rounding.h | 14 + crypto_sign/dilithium3aes/clean/sign.c | 343 +++++ crypto_sign/dilithium3aes/clean/sign.h | 29 + .../dilithium3aes/clean/symmetric-aes.c | 12 + crypto_sign/dilithium3aes/clean/symmetric.h | 33 + crypto_sign/dilithium4/META.yml | 31 - crypto_sign/dilithium4/avx2/LICENSE | 6 - crypto_sign/dilithium4/avx2/Makefile | 40 - crypto_sign/dilithium4/avx2/alignment.h | 22 - crypto_sign/dilithium4/avx2/api.h | 38 - crypto_sign/dilithium4/avx2/fips202x4.c | 233 ---- 
crypto_sign/dilithium4/avx2/fips202x4.h | 66 - crypto_sign/dilithium4/avx2/invntt.S | 282 ---- crypto_sign/dilithium4/avx2/ntt.S | 179 --- crypto_sign/dilithium4/avx2/ntt.h | 36 - crypto_sign/dilithium4/avx2/nttconsts.c | 80 -- crypto_sign/dilithium4/avx2/nttconsts.h | 27 - crypto_sign/dilithium4/avx2/packing.c | 297 ---- crypto_sign/dilithium4/avx2/packing.h | 42 - crypto_sign/dilithium4/avx2/params.h | 29 - crypto_sign/dilithium4/avx2/pointwise.S | 194 --- crypto_sign/dilithium4/avx2/poly.c | 923 ------------- crypto_sign/dilithium4/avx2/poly.h | 83 -- crypto_sign/dilithium4/avx2/polyvec.c | 323 ----- crypto_sign/dilithium4/avx2/polyvec.h | 58 - crypto_sign/dilithium4/avx2/reduce.S | 93 -- crypto_sign/dilithium4/avx2/reduce.h | 9 - crypto_sign/dilithium4/avx2/rejsample.h | 25 - crypto_sign/dilithium4/avx2/rounding.c | 115 -- crypto_sign/dilithium4/avx2/rounding.h | 12 - crypto_sign/dilithium4/avx2/shuffle.inc | 23 - crypto_sign/dilithium4/avx2/sign.c | 463 ------- crypto_sign/dilithium4/avx2/sign.h | 15 - crypto_sign/dilithium4/avx2/stream.c | 26 - crypto_sign/dilithium4/avx2/stream.h | 15 - crypto_sign/dilithium4/avx2/symmetric.h | 25 - crypto_sign/dilithium4/clean/LICENSE | 6 - crypto_sign/dilithium4/clean/Makefile | 22 - .../dilithium4/clean/Makefile.Microsoft_nmake | 18 - crypto_sign/dilithium4/clean/api.h | 38 - crypto_sign/dilithium4/clean/ntt.c | 138 -- crypto_sign/dilithium4/clean/ntt.h | 11 - crypto_sign/dilithium4/clean/packing.c | 297 ---- crypto_sign/dilithium4/clean/packing.h | 42 - crypto_sign/dilithium4/clean/params.h | 29 - crypto_sign/dilithium4/clean/poly.c | 726 ---------- crypto_sign/dilithium4/clean/poly.h | 66 - crypto_sign/dilithium4/clean/polyvec.c | 336 ----- crypto_sign/dilithium4/clean/polyvec.h | 58 - crypto_sign/dilithium4/clean/reduce.c | 75 -- crypto_sign/dilithium4/clean/reduce.h | 21 - crypto_sign/dilithium4/clean/rounding.c | 117 -- crypto_sign/dilithium4/clean/rounding.h | 11 - crypto_sign/dilithium4/clean/sign.c | 427 ------ 
crypto_sign/dilithium4/clean/sign.h | 12 - crypto_sign/dilithium4/clean/stream.c | 26 - crypto_sign/dilithium4/clean/stream.h | 15 - crypto_sign/dilithium4/clean/symmetric.h | 25 - crypto_sign/dilithium5/META.yml | 31 + crypto_sign/dilithium5/avx2/LICENSE | 5 + crypto_sign/dilithium5/avx2/Makefile | 31 + crypto_sign/dilithium5/avx2/align.h | 19 + crypto_sign/dilithium5/avx2/api.h | 30 + .../cdecl.inc => dilithium5/avx2/cdecl.h} | 18 +- crypto_sign/dilithium5/avx2/consts.c | 101 ++ crypto_sign/dilithium5/avx2/consts.h | 10 + crypto_sign/dilithium5/avx2/f1600x4.S | 909 +++++++++++++ crypto_sign/dilithium5/avx2/fips202x4.c | 219 +++ crypto_sign/dilithium5/avx2/fips202x4.h | 64 + crypto_sign/dilithium5/avx2/invntt.S | 240 ++++ crypto_sign/dilithium5/avx2/ntt.S | 199 +++ crypto_sign/dilithium5/avx2/ntt.h | 14 + crypto_sign/dilithium5/avx2/packing.c | 261 ++++ crypto_sign/dilithium5/avx2/packing.h | 31 + crypto_sign/dilithium5/avx2/params.h | 41 + crypto_sign/dilithium5/avx2/pointwise.S | 205 +++ crypto_sign/dilithium5/avx2/poly.c | 1022 ++++++++++++++ crypto_sign/dilithium5/avx2/poly.h | 79 ++ crypto_sign/dilithium5/avx2/polyvec.c | 538 ++++++++ crypto_sign/dilithium5/avx2/polyvec.h | 72 + .../avx2/rejsample.c | 265 ++-- crypto_sign/dilithium5/avx2/rejsample.h | 19 + crypto_sign/dilithium5/avx2/rounding.c | 154 +++ crypto_sign/dilithium5/avx2/rounding.h | 12 + crypto_sign/dilithium5/avx2/shuffle.S | 54 + crypto_sign/dilithium5/avx2/shuffle.inc | 25 + crypto_sign/dilithium5/avx2/sign.c | 435 ++++++ crypto_sign/dilithium5/avx2/sign.h | 29 + crypto_sign/dilithium5/avx2/symmetric-shake.c | 26 + crypto_sign/dilithium5/avx2/symmetric.h | 36 + crypto_sign/dilithium5/clean/LICENSE | 5 + crypto_sign/dilithium5/clean/Makefile | 19 + .../dilithium5/clean/Makefile.Microsoft_nmake | 23 + crypto_sign/dilithium5/clean/api.h | 30 + crypto_sign/dilithium5/clean/ntt.c | 98 ++ crypto_sign/dilithium5/clean/ntt.h | 10 + crypto_sign/dilithium5/clean/packing.c | 261 ++++ 
crypto_sign/dilithium5/clean/packing.h | 31 + crypto_sign/dilithium5/clean/params.h | 41 + crypto_sign/dilithium5/clean/poly.c | 842 ++++++++++++ crypto_sign/dilithium5/clean/poly.h | 53 + crypto_sign/dilithium5/clean/polyvec.c | 448 ++++++ crypto_sign/dilithium5/clean/polyvec.h | 68 + crypto_sign/dilithium5/clean/reduce.c | 69 + crypto_sign/dilithium5/clean/reduce.h | 17 + crypto_sign/dilithium5/clean/rounding.c | 92 ++ crypto_sign/dilithium5/clean/rounding.h | 14 + crypto_sign/dilithium5/clean/sign.c | 343 +++++ crypto_sign/dilithium5/clean/sign.h | 29 + .../dilithium5/clean/symmetric-shake.c | 26 + crypto_sign/dilithium5/clean/symmetric.h | 36 + crypto_sign/dilithium5aes/META.yml | 31 + crypto_sign/dilithium5aes/avx2/LICENSE | 5 + crypto_sign/dilithium5aes/avx2/Makefile | 23 + crypto_sign/dilithium5aes/avx2/aes256ctr.c | 142 ++ crypto_sign/dilithium5aes/avx2/aes256ctr.h | 29 + crypto_sign/dilithium5aes/avx2/align.h | 19 + crypto_sign/dilithium5aes/avx2/api.h | 30 + crypto_sign/dilithium5aes/avx2/cdecl.h | 24 + crypto_sign/dilithium5aes/avx2/consts.c | 101 ++ crypto_sign/dilithium5aes/avx2/consts.h | 10 + crypto_sign/dilithium5aes/avx2/invntt.S | 240 ++++ crypto_sign/dilithium5aes/avx2/ntt.S | 199 +++ crypto_sign/dilithium5aes/avx2/ntt.h | 14 + crypto_sign/dilithium5aes/avx2/packing.c | 261 ++++ crypto_sign/dilithium5aes/avx2/packing.h | 31 + crypto_sign/dilithium5aes/avx2/params.h | 41 + crypto_sign/dilithium5aes/avx2/pointwise.S | 205 +++ crypto_sign/dilithium5aes/avx2/poly.c | 886 ++++++++++++ crypto_sign/dilithium5aes/avx2/poly.h | 52 + crypto_sign/dilithium5aes/avx2/polyvec.c | 449 ++++++ crypto_sign/dilithium5aes/avx2/polyvec.h | 64 + crypto_sign/dilithium5aes/avx2/rejsample.c | 394 ++++++ crypto_sign/dilithium5aes/avx2/rejsample.h | 19 + crypto_sign/dilithium5aes/avx2/rounding.c | 154 +++ crypto_sign/dilithium5aes/avx2/rounding.h | 12 + crypto_sign/dilithium5aes/avx2/shuffle.S | 54 + crypto_sign/dilithium5aes/avx2/shuffle.inc | 25 + 
crypto_sign/dilithium5aes/avx2/sign.c | 425 ++++++ crypto_sign/dilithium5aes/avx2/sign.h | 29 + crypto_sign/dilithium5aes/avx2/symmetric.h | 25 + crypto_sign/dilithium5aes/clean/LICENSE | 5 + crypto_sign/dilithium5aes/clean/Makefile | 19 + .../clean/Makefile.Microsoft_nmake | 23 + crypto_sign/dilithium5aes/clean/aes256ctr.c | 564 ++++++++ crypto_sign/dilithium5aes/clean/aes256ctr.h | 28 + crypto_sign/dilithium5aes/clean/api.h | 30 + crypto_sign/dilithium5aes/clean/ntt.c | 98 ++ crypto_sign/dilithium5aes/clean/ntt.h | 10 + crypto_sign/dilithium5aes/clean/packing.c | 261 ++++ crypto_sign/dilithium5aes/clean/packing.h | 31 + crypto_sign/dilithium5aes/clean/params.h | 41 + crypto_sign/dilithium5aes/clean/poly.c | 842 ++++++++++++ crypto_sign/dilithium5aes/clean/poly.h | 53 + crypto_sign/dilithium5aes/clean/polyvec.c | 448 ++++++ crypto_sign/dilithium5aes/clean/polyvec.h | 68 + crypto_sign/dilithium5aes/clean/reduce.c | 69 + crypto_sign/dilithium5aes/clean/reduce.h | 17 + crypto_sign/dilithium5aes/clean/rounding.c | 92 ++ crypto_sign/dilithium5aes/clean/rounding.h | 14 + crypto_sign/dilithium5aes/clean/sign.c | 343 +++++ crypto_sign/dilithium5aes/clean/sign.h | 29 + .../dilithium5aes/clean/symmetric-aes.c | 12 + crypto_sign/dilithium5aes/clean/symmetric.h | 33 + .../duplicate_consistency/dilithium2_avx2.yml | 194 ++- .../dilithium2_clean.yml | 189 ++- .../dilithium2aes_avx2.yml | 129 ++ .../dilithium2aes_clean.yml | 135 ++ .../duplicate_consistency/dilithium3_avx2.yml | 193 ++- .../dilithium3_clean.yml | 191 ++- .../dilithium3aes_avx2.yml | 128 ++ .../dilithium3aes_clean.yml | 137 ++ .../duplicate_consistency/dilithium4_avx2.yml | 63 - .../dilithium4_clean.yml | 53 - .../duplicate_consistency/dilithium5_avx2.yml | 135 ++ .../dilithium5_clean.yml | 139 ++ .../dilithium5aes_avx2.yml | 131 ++ .../dilithium5aes_clean.yml | 137 ++ 409 files changed, 40654 insertions(+), 14120 deletions(-) create mode 100644 .github/workflows/sign_dilithium2aes.yml create mode 100644 
.github/workflows/sign_dilithium3aes.yml rename .github/workflows/{sign_dilithium4.yml => sign_dilithium5.yml} (92%) create mode 100644 .github/workflows/sign_dilithium5aes.yml create mode 100644 BADGES.md create mode 100644 crypto_sign/dilithium2/avx2/align.h delete mode 100644 crypto_sign/dilithium2/avx2/alignment.h rename crypto_sign/{dilithium4/avx2/cdecl.inc => dilithium2/avx2/cdecl.h} (55%) create mode 100644 crypto_sign/dilithium2/avx2/consts.c create mode 100644 crypto_sign/dilithium2/avx2/consts.h create mode 100644 crypto_sign/dilithium2/avx2/f1600x4.S delete mode 100644 crypto_sign/dilithium2/avx2/nttconsts.c delete mode 100644 crypto_sign/dilithium2/avx2/nttconsts.h delete mode 100644 crypto_sign/dilithium2/avx2/reduce.S delete mode 100644 crypto_sign/dilithium2/avx2/reduce.h create mode 100644 crypto_sign/dilithium2/avx2/shuffle.S delete mode 100644 crypto_sign/dilithium2/avx2/stream.c delete mode 100644 crypto_sign/dilithium2/avx2/stream.h create mode 100644 crypto_sign/dilithium2/avx2/symmetric-shake.c delete mode 100644 crypto_sign/dilithium2/clean/stream.c delete mode 100644 crypto_sign/dilithium2/clean/stream.h create mode 100644 crypto_sign/dilithium2/clean/symmetric-shake.c create mode 100644 crypto_sign/dilithium2aes/META.yml create mode 100644 crypto_sign/dilithium2aes/avx2/LICENSE create mode 100644 crypto_sign/dilithium2aes/avx2/Makefile create mode 100644 crypto_sign/dilithium2aes/avx2/aes256ctr.c create mode 100644 crypto_sign/dilithium2aes/avx2/aes256ctr.h create mode 100644 crypto_sign/dilithium2aes/avx2/align.h create mode 100644 crypto_sign/dilithium2aes/avx2/api.h create mode 100644 crypto_sign/dilithium2aes/avx2/cdecl.h create mode 100644 crypto_sign/dilithium2aes/avx2/consts.c create mode 100644 crypto_sign/dilithium2aes/avx2/consts.h create mode 100644 crypto_sign/dilithium2aes/avx2/invntt.S create mode 100644 crypto_sign/dilithium2aes/avx2/ntt.S create mode 100644 crypto_sign/dilithium2aes/avx2/ntt.h create mode 100644 
crypto_sign/dilithium2aes/avx2/packing.c create mode 100644 crypto_sign/dilithium2aes/avx2/packing.h create mode 100644 crypto_sign/dilithium2aes/avx2/params.h create mode 100644 crypto_sign/dilithium2aes/avx2/pointwise.S create mode 100644 crypto_sign/dilithium2aes/avx2/poly.c create mode 100644 crypto_sign/dilithium2aes/avx2/poly.h create mode 100644 crypto_sign/dilithium2aes/avx2/polyvec.c create mode 100644 crypto_sign/dilithium2aes/avx2/polyvec.h create mode 100644 crypto_sign/dilithium2aes/avx2/rejsample.c create mode 100644 crypto_sign/dilithium2aes/avx2/rejsample.h create mode 100644 crypto_sign/dilithium2aes/avx2/rounding.c create mode 100644 crypto_sign/dilithium2aes/avx2/rounding.h create mode 100644 crypto_sign/dilithium2aes/avx2/shuffle.S create mode 100644 crypto_sign/dilithium2aes/avx2/shuffle.inc create mode 100644 crypto_sign/dilithium2aes/avx2/sign.c create mode 100644 crypto_sign/dilithium2aes/avx2/sign.h create mode 100644 crypto_sign/dilithium2aes/avx2/symmetric.h create mode 100644 crypto_sign/dilithium2aes/clean/LICENSE create mode 100644 crypto_sign/dilithium2aes/clean/Makefile create mode 100644 crypto_sign/dilithium2aes/clean/Makefile.Microsoft_nmake create mode 100644 crypto_sign/dilithium2aes/clean/aes256ctr.c create mode 100644 crypto_sign/dilithium2aes/clean/aes256ctr.h create mode 100644 crypto_sign/dilithium2aes/clean/api.h create mode 100644 crypto_sign/dilithium2aes/clean/ntt.c create mode 100644 crypto_sign/dilithium2aes/clean/ntt.h create mode 100644 crypto_sign/dilithium2aes/clean/packing.c create mode 100644 crypto_sign/dilithium2aes/clean/packing.h create mode 100644 crypto_sign/dilithium2aes/clean/params.h create mode 100644 crypto_sign/dilithium2aes/clean/poly.c create mode 100644 crypto_sign/dilithium2aes/clean/poly.h create mode 100644 crypto_sign/dilithium2aes/clean/polyvec.c create mode 100644 crypto_sign/dilithium2aes/clean/polyvec.h create mode 100644 crypto_sign/dilithium2aes/clean/reduce.c create mode 100644 
crypto_sign/dilithium2aes/clean/reduce.h create mode 100644 crypto_sign/dilithium2aes/clean/rounding.c create mode 100644 crypto_sign/dilithium2aes/clean/rounding.h create mode 100644 crypto_sign/dilithium2aes/clean/sign.c create mode 100644 crypto_sign/dilithium2aes/clean/sign.h create mode 100644 crypto_sign/dilithium2aes/clean/symmetric-aes.c create mode 100644 crypto_sign/dilithium2aes/clean/symmetric.h create mode 100644 crypto_sign/dilithium3/avx2/align.h delete mode 100644 crypto_sign/dilithium3/avx2/alignment.h rename crypto_sign/{dilithium2/avx2/cdecl.inc => dilithium3/avx2/cdecl.h} (55%) create mode 100644 crypto_sign/dilithium3/avx2/consts.c create mode 100644 crypto_sign/dilithium3/avx2/consts.h create mode 100644 crypto_sign/dilithium3/avx2/f1600x4.S delete mode 100644 crypto_sign/dilithium3/avx2/nttconsts.c delete mode 100644 crypto_sign/dilithium3/avx2/nttconsts.h delete mode 100644 crypto_sign/dilithium3/avx2/reduce.S delete mode 100644 crypto_sign/dilithium3/avx2/reduce.h create mode 100644 crypto_sign/dilithium3/avx2/shuffle.S delete mode 100644 crypto_sign/dilithium3/avx2/stream.c delete mode 100644 crypto_sign/dilithium3/avx2/stream.h create mode 100644 crypto_sign/dilithium3/avx2/symmetric-shake.c delete mode 100644 crypto_sign/dilithium3/clean/stream.c delete mode 100644 crypto_sign/dilithium3/clean/stream.h create mode 100644 crypto_sign/dilithium3/clean/symmetric-shake.c create mode 100644 crypto_sign/dilithium3aes/META.yml create mode 100644 crypto_sign/dilithium3aes/avx2/LICENSE create mode 100644 crypto_sign/dilithium3aes/avx2/Makefile create mode 100644 crypto_sign/dilithium3aes/avx2/aes256ctr.c create mode 100644 crypto_sign/dilithium3aes/avx2/aes256ctr.h create mode 100644 crypto_sign/dilithium3aes/avx2/align.h create mode 100644 crypto_sign/dilithium3aes/avx2/api.h create mode 100644 crypto_sign/dilithium3aes/avx2/cdecl.h create mode 100644 crypto_sign/dilithium3aes/avx2/consts.c create mode 100644 
crypto_sign/dilithium3aes/avx2/consts.h create mode 100644 crypto_sign/dilithium3aes/avx2/invntt.S create mode 100644 crypto_sign/dilithium3aes/avx2/ntt.S create mode 100644 crypto_sign/dilithium3aes/avx2/ntt.h create mode 100644 crypto_sign/dilithium3aes/avx2/packing.c create mode 100644 crypto_sign/dilithium3aes/avx2/packing.h create mode 100644 crypto_sign/dilithium3aes/avx2/params.h create mode 100644 crypto_sign/dilithium3aes/avx2/pointwise.S create mode 100644 crypto_sign/dilithium3aes/avx2/poly.c create mode 100644 crypto_sign/dilithium3aes/avx2/poly.h create mode 100644 crypto_sign/dilithium3aes/avx2/polyvec.c create mode 100644 crypto_sign/dilithium3aes/avx2/polyvec.h create mode 100644 crypto_sign/dilithium3aes/avx2/rejsample.c create mode 100644 crypto_sign/dilithium3aes/avx2/rejsample.h create mode 100644 crypto_sign/dilithium3aes/avx2/rounding.c create mode 100644 crypto_sign/dilithium3aes/avx2/rounding.h create mode 100644 crypto_sign/dilithium3aes/avx2/shuffle.S create mode 100644 crypto_sign/dilithium3aes/avx2/shuffle.inc create mode 100644 crypto_sign/dilithium3aes/avx2/sign.c create mode 100644 crypto_sign/dilithium3aes/avx2/sign.h create mode 100644 crypto_sign/dilithium3aes/avx2/symmetric.h create mode 100644 crypto_sign/dilithium3aes/clean/LICENSE create mode 100644 crypto_sign/dilithium3aes/clean/Makefile create mode 100644 crypto_sign/dilithium3aes/clean/Makefile.Microsoft_nmake create mode 100644 crypto_sign/dilithium3aes/clean/aes256ctr.c create mode 100644 crypto_sign/dilithium3aes/clean/aes256ctr.h create mode 100644 crypto_sign/dilithium3aes/clean/api.h create mode 100644 crypto_sign/dilithium3aes/clean/ntt.c create mode 100644 crypto_sign/dilithium3aes/clean/ntt.h create mode 100644 crypto_sign/dilithium3aes/clean/packing.c create mode 100644 crypto_sign/dilithium3aes/clean/packing.h create mode 100644 crypto_sign/dilithium3aes/clean/params.h create mode 100644 crypto_sign/dilithium3aes/clean/poly.c create mode 100644 
crypto_sign/dilithium3aes/clean/poly.h create mode 100644 crypto_sign/dilithium3aes/clean/polyvec.c create mode 100644 crypto_sign/dilithium3aes/clean/polyvec.h create mode 100644 crypto_sign/dilithium3aes/clean/reduce.c create mode 100644 crypto_sign/dilithium3aes/clean/reduce.h create mode 100644 crypto_sign/dilithium3aes/clean/rounding.c create mode 100644 crypto_sign/dilithium3aes/clean/rounding.h create mode 100644 crypto_sign/dilithium3aes/clean/sign.c create mode 100644 crypto_sign/dilithium3aes/clean/sign.h create mode 100644 crypto_sign/dilithium3aes/clean/symmetric-aes.c create mode 100644 crypto_sign/dilithium3aes/clean/symmetric.h delete mode 100644 crypto_sign/dilithium4/META.yml delete mode 100644 crypto_sign/dilithium4/avx2/LICENSE delete mode 100644 crypto_sign/dilithium4/avx2/Makefile delete mode 100644 crypto_sign/dilithium4/avx2/alignment.h delete mode 100644 crypto_sign/dilithium4/avx2/api.h delete mode 100644 crypto_sign/dilithium4/avx2/fips202x4.c delete mode 100644 crypto_sign/dilithium4/avx2/fips202x4.h delete mode 100644 crypto_sign/dilithium4/avx2/invntt.S delete mode 100644 crypto_sign/dilithium4/avx2/ntt.S delete mode 100644 crypto_sign/dilithium4/avx2/ntt.h delete mode 100644 crypto_sign/dilithium4/avx2/nttconsts.c delete mode 100644 crypto_sign/dilithium4/avx2/nttconsts.h delete mode 100644 crypto_sign/dilithium4/avx2/packing.c delete mode 100644 crypto_sign/dilithium4/avx2/packing.h delete mode 100644 crypto_sign/dilithium4/avx2/params.h delete mode 100644 crypto_sign/dilithium4/avx2/pointwise.S delete mode 100644 crypto_sign/dilithium4/avx2/poly.c delete mode 100644 crypto_sign/dilithium4/avx2/poly.h delete mode 100644 crypto_sign/dilithium4/avx2/polyvec.c delete mode 100644 crypto_sign/dilithium4/avx2/polyvec.h delete mode 100644 crypto_sign/dilithium4/avx2/reduce.S delete mode 100644 crypto_sign/dilithium4/avx2/reduce.h delete mode 100644 crypto_sign/dilithium4/avx2/rejsample.h delete mode 100644 
crypto_sign/dilithium4/avx2/rounding.c delete mode 100644 crypto_sign/dilithium4/avx2/rounding.h delete mode 100644 crypto_sign/dilithium4/avx2/shuffle.inc delete mode 100644 crypto_sign/dilithium4/avx2/sign.c delete mode 100644 crypto_sign/dilithium4/avx2/sign.h delete mode 100644 crypto_sign/dilithium4/avx2/stream.c delete mode 100644 crypto_sign/dilithium4/avx2/stream.h delete mode 100644 crypto_sign/dilithium4/avx2/symmetric.h delete mode 100644 crypto_sign/dilithium4/clean/LICENSE delete mode 100644 crypto_sign/dilithium4/clean/Makefile delete mode 100644 crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake delete mode 100644 crypto_sign/dilithium4/clean/api.h delete mode 100644 crypto_sign/dilithium4/clean/ntt.c delete mode 100644 crypto_sign/dilithium4/clean/ntt.h delete mode 100644 crypto_sign/dilithium4/clean/packing.c delete mode 100644 crypto_sign/dilithium4/clean/packing.h delete mode 100644 crypto_sign/dilithium4/clean/params.h delete mode 100644 crypto_sign/dilithium4/clean/poly.c delete mode 100644 crypto_sign/dilithium4/clean/poly.h delete mode 100644 crypto_sign/dilithium4/clean/polyvec.c delete mode 100644 crypto_sign/dilithium4/clean/polyvec.h delete mode 100644 crypto_sign/dilithium4/clean/reduce.c delete mode 100644 crypto_sign/dilithium4/clean/reduce.h delete mode 100644 crypto_sign/dilithium4/clean/rounding.c delete mode 100644 crypto_sign/dilithium4/clean/rounding.h delete mode 100644 crypto_sign/dilithium4/clean/sign.c delete mode 100644 crypto_sign/dilithium4/clean/sign.h delete mode 100644 crypto_sign/dilithium4/clean/stream.c delete mode 100644 crypto_sign/dilithium4/clean/stream.h delete mode 100644 crypto_sign/dilithium4/clean/symmetric.h create mode 100644 crypto_sign/dilithium5/META.yml create mode 100644 crypto_sign/dilithium5/avx2/LICENSE create mode 100644 crypto_sign/dilithium5/avx2/Makefile create mode 100644 crypto_sign/dilithium5/avx2/align.h create mode 100644 crypto_sign/dilithium5/avx2/api.h rename 
crypto_sign/{dilithium3/avx2/cdecl.inc => dilithium5/avx2/cdecl.h} (55%) create mode 100644 crypto_sign/dilithium5/avx2/consts.c create mode 100644 crypto_sign/dilithium5/avx2/consts.h create mode 100644 crypto_sign/dilithium5/avx2/f1600x4.S create mode 100644 crypto_sign/dilithium5/avx2/fips202x4.c create mode 100644 crypto_sign/dilithium5/avx2/fips202x4.h create mode 100644 crypto_sign/dilithium5/avx2/invntt.S create mode 100644 crypto_sign/dilithium5/avx2/ntt.S create mode 100644 crypto_sign/dilithium5/avx2/ntt.h create mode 100644 crypto_sign/dilithium5/avx2/packing.c create mode 100644 crypto_sign/dilithium5/avx2/packing.h create mode 100644 crypto_sign/dilithium5/avx2/params.h create mode 100644 crypto_sign/dilithium5/avx2/pointwise.S create mode 100644 crypto_sign/dilithium5/avx2/poly.c create mode 100644 crypto_sign/dilithium5/avx2/poly.h create mode 100644 crypto_sign/dilithium5/avx2/polyvec.c create mode 100644 crypto_sign/dilithium5/avx2/polyvec.h rename crypto_sign/{dilithium4 => dilithium5}/avx2/rejsample.c (66%) create mode 100644 crypto_sign/dilithium5/avx2/rejsample.h create mode 100644 crypto_sign/dilithium5/avx2/rounding.c create mode 100644 crypto_sign/dilithium5/avx2/rounding.h create mode 100644 crypto_sign/dilithium5/avx2/shuffle.S create mode 100644 crypto_sign/dilithium5/avx2/shuffle.inc create mode 100644 crypto_sign/dilithium5/avx2/sign.c create mode 100644 crypto_sign/dilithium5/avx2/sign.h create mode 100644 crypto_sign/dilithium5/avx2/symmetric-shake.c create mode 100644 crypto_sign/dilithium5/avx2/symmetric.h create mode 100644 crypto_sign/dilithium5/clean/LICENSE create mode 100644 crypto_sign/dilithium5/clean/Makefile create mode 100644 crypto_sign/dilithium5/clean/Makefile.Microsoft_nmake create mode 100644 crypto_sign/dilithium5/clean/api.h create mode 100644 crypto_sign/dilithium5/clean/ntt.c create mode 100644 crypto_sign/dilithium5/clean/ntt.h create mode 100644 crypto_sign/dilithium5/clean/packing.c create mode 100644 
crypto_sign/dilithium5/clean/packing.h create mode 100644 crypto_sign/dilithium5/clean/params.h create mode 100644 crypto_sign/dilithium5/clean/poly.c create mode 100644 crypto_sign/dilithium5/clean/poly.h create mode 100644 crypto_sign/dilithium5/clean/polyvec.c create mode 100644 crypto_sign/dilithium5/clean/polyvec.h create mode 100644 crypto_sign/dilithium5/clean/reduce.c create mode 100644 crypto_sign/dilithium5/clean/reduce.h create mode 100644 crypto_sign/dilithium5/clean/rounding.c create mode 100644 crypto_sign/dilithium5/clean/rounding.h create mode 100644 crypto_sign/dilithium5/clean/sign.c create mode 100644 crypto_sign/dilithium5/clean/sign.h create mode 100644 crypto_sign/dilithium5/clean/symmetric-shake.c create mode 100644 crypto_sign/dilithium5/clean/symmetric.h create mode 100644 crypto_sign/dilithium5aes/META.yml create mode 100644 crypto_sign/dilithium5aes/avx2/LICENSE create mode 100644 crypto_sign/dilithium5aes/avx2/Makefile create mode 100644 crypto_sign/dilithium5aes/avx2/aes256ctr.c create mode 100644 crypto_sign/dilithium5aes/avx2/aes256ctr.h create mode 100644 crypto_sign/dilithium5aes/avx2/align.h create mode 100644 crypto_sign/dilithium5aes/avx2/api.h create mode 100644 crypto_sign/dilithium5aes/avx2/cdecl.h create mode 100644 crypto_sign/dilithium5aes/avx2/consts.c create mode 100644 crypto_sign/dilithium5aes/avx2/consts.h create mode 100644 crypto_sign/dilithium5aes/avx2/invntt.S create mode 100644 crypto_sign/dilithium5aes/avx2/ntt.S create mode 100644 crypto_sign/dilithium5aes/avx2/ntt.h create mode 100644 crypto_sign/dilithium5aes/avx2/packing.c create mode 100644 crypto_sign/dilithium5aes/avx2/packing.h create mode 100644 crypto_sign/dilithium5aes/avx2/params.h create mode 100644 crypto_sign/dilithium5aes/avx2/pointwise.S create mode 100644 crypto_sign/dilithium5aes/avx2/poly.c create mode 100644 crypto_sign/dilithium5aes/avx2/poly.h create mode 100644 crypto_sign/dilithium5aes/avx2/polyvec.c create mode 100644 
crypto_sign/dilithium5aes/avx2/polyvec.h create mode 100644 crypto_sign/dilithium5aes/avx2/rejsample.c create mode 100644 crypto_sign/dilithium5aes/avx2/rejsample.h create mode 100644 crypto_sign/dilithium5aes/avx2/rounding.c create mode 100644 crypto_sign/dilithium5aes/avx2/rounding.h create mode 100644 crypto_sign/dilithium5aes/avx2/shuffle.S create mode 100644 crypto_sign/dilithium5aes/avx2/shuffle.inc create mode 100644 crypto_sign/dilithium5aes/avx2/sign.c create mode 100644 crypto_sign/dilithium5aes/avx2/sign.h create mode 100644 crypto_sign/dilithium5aes/avx2/symmetric.h create mode 100644 crypto_sign/dilithium5aes/clean/LICENSE create mode 100644 crypto_sign/dilithium5aes/clean/Makefile create mode 100644 crypto_sign/dilithium5aes/clean/Makefile.Microsoft_nmake create mode 100644 crypto_sign/dilithium5aes/clean/aes256ctr.c create mode 100644 crypto_sign/dilithium5aes/clean/aes256ctr.h create mode 100644 crypto_sign/dilithium5aes/clean/api.h create mode 100644 crypto_sign/dilithium5aes/clean/ntt.c create mode 100644 crypto_sign/dilithium5aes/clean/ntt.h create mode 100644 crypto_sign/dilithium5aes/clean/packing.c create mode 100644 crypto_sign/dilithium5aes/clean/packing.h create mode 100644 crypto_sign/dilithium5aes/clean/params.h create mode 100644 crypto_sign/dilithium5aes/clean/poly.c create mode 100644 crypto_sign/dilithium5aes/clean/poly.h create mode 100644 crypto_sign/dilithium5aes/clean/polyvec.c create mode 100644 crypto_sign/dilithium5aes/clean/polyvec.h create mode 100644 crypto_sign/dilithium5aes/clean/reduce.c create mode 100644 crypto_sign/dilithium5aes/clean/reduce.h create mode 100644 crypto_sign/dilithium5aes/clean/rounding.c create mode 100644 crypto_sign/dilithium5aes/clean/rounding.h create mode 100644 crypto_sign/dilithium5aes/clean/sign.c create mode 100644 crypto_sign/dilithium5aes/clean/sign.h create mode 100644 crypto_sign/dilithium5aes/clean/symmetric-aes.c create mode 100644 crypto_sign/dilithium5aes/clean/symmetric.h create mode 
100644 test/duplicate_consistency/dilithium2aes_avx2.yml create mode 100644 test/duplicate_consistency/dilithium2aes_clean.yml create mode 100644 test/duplicate_consistency/dilithium3aes_avx2.yml create mode 100644 test/duplicate_consistency/dilithium3aes_clean.yml delete mode 100644 test/duplicate_consistency/dilithium4_avx2.yml delete mode 100644 test/duplicate_consistency/dilithium4_clean.yml create mode 100644 test/duplicate_consistency/dilithium5_avx2.yml create mode 100644 test/duplicate_consistency/dilithium5_clean.yml create mode 100644 test/duplicate_consistency/dilithium5aes_avx2.yml create mode 100644 test/duplicate_consistency/dilithium5aes_clean.yml diff --git a/.github/workflows/BADGES.md b/.github/workflows/BADGES.md index d625517f..4f8bc542 100644 --- a/.github/workflows/BADGES.md +++ b/.github/workflows/BADGES.md @@ -1,88 +1,91 @@ -![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master) -![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master) -![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master) -![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master) -![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master) -![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master) -![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master) -![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master) -![Test 
sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master) -![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master) -![Test sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master) +![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master) ![Test rainbowV-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-circumzenithal/badge.svg?branch=master) -![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master) -![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master) ![Test rainbowIII-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-circumzenithal/badge.svg?branch=master) -![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master) ![Test sphincs-haraka-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-simple/badge.svg?branch=master) -![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master) -![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master) -![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master) -![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master) -![Test 
sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master) +![Test sphincs-sha256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-simple/badge.svg?branch=master) +![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master) +![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master) ![Test rainbowI-circumzenithal](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-circumzenithal/badge.svg?branch=master) -![Test sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master) -![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master) -![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master) -![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master) -![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master) -![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master) -![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master) -![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master) -![Test dilithium4](https://github.com/PQClean/PQClean/workflows/Test%20dilithium4/badge.svg?branch=master) -![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master) -![Test 
sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master) -![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master) ![Test rainbowI-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-classic/badge.svg?branch=master) ![Test sphincs-haraka-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-robust/badge.svg?branch=master) -![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master) -![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master) -![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master) -![Test rainbowIII-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-classic/badge.svg?branch=master) +![Test sphincs-sha256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-simple/badge.svg?branch=master) ![Test sphincs-haraka-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192f-robust/badge.svg?branch=master) +![Test sphincs-haraka-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-robust/badge.svg?branch=master) +![Test sphincs-shake256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-robust/badge.svg?branch=master) +![Test sphincs-haraka-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-simple/badge.svg?branch=master) +![Test sphincs-haraka-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256s-simple/badge.svg?branch=master) +![Test 
sphincs-sha256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-robust/badge.svg?branch=master) +![Test sphincs-shake256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-simple/badge.svg?branch=master) +![Test sphincs-sha256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192f-robust/badge.svg?branch=master) ![Test dilithium3](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3/badge.svg?branch=master) -![Test sphincs-sha256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-simple/badge.svg?branch=master) -![Test sphincs-shake256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-simple/badge.svg?branch=master) +![Test rainbowI-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowI-compressed/badge.svg?branch=master) +![Test dilithium2](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2/badge.svg?branch=master) +![Test sphincs-sha256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-robust/badge.svg?branch=master) +![Test sphincs-shake256-256s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-robust/badge.svg?branch=master) +![Test dilithium5](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5/badge.svg?branch=master) +![Test sphincs-haraka-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-256f-simple/badge.svg?branch=master) +![Test sphincs-haraka-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-robust/badge.svg?branch=master) ![Test sphincs-shake256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-simple/badge.svg?branch=master) -![Test sphincs-haraka-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-robust/badge.svg?branch=master) 
-![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master) -![Test sphincs-shake256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192s-simple/badge.svg?branch=master) +![Test sphincs-sha256-256f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-simple/badge.svg?branch=master) +![Test sphincs-shake256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-robust/badge.svg?branch=master) +![Test sphincs-shake256-192f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-simple/badge.svg?branch=master) +![Test rainbowV-classic](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-classic/badge.svg?branch=master) +![Test sphincs-sha256-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-robust/badge.svg?branch=master) +![Test sphincs-shake256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256s-simple/badge.svg?branch=master) +![Test rainbowV-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowV-compressed/badge.svg?branch=master) +![Test sphincs-sha256-256s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256s-simple/badge.svg?branch=master) +![Test dilithium5aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium5aes/badge.svg?branch=master) +![Test dilithium2aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium2aes/badge.svg?branch=master) ![Test sphincs-sha256-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128s-simple/badge.svg?branch=master) -![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master) -![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master) -![Test 
kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master) -![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master) -![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master) +![Test sphincs-sha256-192s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-192s-simple/badge.svg?branch=master) +![Test rainbowIII-compressed](https://github.com/PQClean/PQClean/workflows/Test%20rainbowIII-compressed/badge.svg?branch=master) +![Test sphincs-sha256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-256f-robust/badge.svg?branch=master) +![Test falcon-512](https://github.com/PQClean/PQClean/workflows/Test%20falcon-512/badge.svg?branch=master) +![Test sphincs-shake256-192f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-192f-robust/badge.svg?branch=master) +![Test falcon-1024](https://github.com/PQClean/PQClean/workflows/Test%20falcon-1024/badge.svg?branch=master) +![Test sphincs-haraka-128s-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128s-simple/badge.svg?branch=master) +![Test sphincs-shake256-256f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-256f-robust/badge.svg?branch=master) +![Test sphincs-shake256-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128f-simple/badge.svg?branch=master) +![Test sphincs-shake256-128s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-shake256-128s-robust/badge.svg?branch=master) +![Test sphincs-haraka-192s-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-192s-robust/badge.svg?branch=master) +![Test sphincs-sha256-128f-robust](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-sha256-128f-robust/badge.svg?branch=master) +![Test 
sphincs-haraka-128f-simple](https://github.com/PQClean/PQClean/workflows/Test%20sphincs-haraka-128f-simple/badge.svg?branch=master) +![Test dilithium3aes](https://github.com/PQClean/PQClean/workflows/Test%20dilithium3aes/badge.svg?branch=master) +![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master) +![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master) +![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master) +![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master) +![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master) +![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master) +![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master) +![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master) +![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master) +![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master) +![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master) ![Test mceliece6960119](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119/badge.svg?branch=master) -![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master) -![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master) ![Test frodokem1344shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344shake/badge.svg?branch=master) ![Test 
mceliece6688128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128f/badge.svg?branch=master) -![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master) -![Test frodokem640aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640aes/badge.svg?branch=master) +![Test mceliece8192128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128f/badge.svg?branch=master) +![Test kyber1024](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024/badge.svg?branch=master) +![Test mceliece348864f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864f/badge.svg?branch=master) ![Test sntrup761](https://github.com/PQClean/PQClean/workflows/Test%20sntrup761/badge.svg?branch=master) +![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master) +![Test ntruhrss701](https://github.com/PQClean/PQClean/workflows/Test%20ntruhrss701/badge.svg?branch=master) +![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master) +![Test ntrulpr653](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr653/badge.svg?branch=master) +![Test kyber1024-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber1024-90s/badge.svg?branch=master) +![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master) +![Test kyber768-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber768-90s/badge.svg?branch=master) +![Test mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master) +![Test saber](https://github.com/PQClean/PQClean/workflows/Test%20saber/badge.svg?branch=master) +![Test kyber768](https://github.com/PQClean/PQClean/workflows/Test%20kyber768/badge.svg?branch=master) +![Test 
kyber512](https://github.com/PQClean/PQClean/workflows/Test%20kyber512/badge.svg?branch=master) +![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master) +![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master) +![Test mceliece8192128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128/badge.svg?branch=master) +![Test mceliece460896](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896/badge.svg?branch=master) +![Test ntruhps2048677](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048677/badge.svg?branch=master) +![Test ntrulpr857](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr857/badge.svg?branch=master) ![Test hqc-rmrs-256](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-256/badge.svg?branch=master) ![Test frodokem976shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976shake/badge.svg?branch=master) -![Test mceliece348864](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864/badge.svg?branch=master) -![Test frodokem1344aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem1344aes/badge.svg?branch=master) -![Test hqc-rmrs-192](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-192/badge.svg?branch=master) -![Test frodokem976aes](https://github.com/PQClean/PQClean/workflows/Test%20frodokem976aes/badge.svg?branch=master) -![Test mceliece8192128f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece8192128f/badge.svg?branch=master) -![Test mceliece460896f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece460896f/badge.svg?branch=master) -![Test kyber512-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber512-90s/badge.svg?branch=master) -![Test mceliece348864f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece348864f/badge.svg?branch=master) -![Test 
mceliece6960119f](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6960119f/badge.svg?branch=master) -![Test firesaber](https://github.com/PQClean/PQClean/workflows/Test%20firesaber/badge.svg?branch=master) -![Test sntrup857](https://github.com/PQClean/PQClean/workflows/Test%20sntrup857/badge.svg?branch=master) ![Test frodokem640shake](https://github.com/PQClean/PQClean/workflows/Test%20frodokem640shake/badge.svg?branch=master) -![Test ntrulpr653](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr653/badge.svg?branch=master) -![Test sntrup653](https://github.com/PQClean/PQClean/workflows/Test%20sntrup653/badge.svg?branch=master) -![Test lightsaber](https://github.com/PQClean/PQClean/workflows/Test%20lightsaber/badge.svg?branch=master) -![Test kyber768-90s](https://github.com/PQClean/PQClean/workflows/Test%20kyber768-90s/badge.svg?branch=master) -![Test kyber768](https://github.com/PQClean/PQClean/workflows/Test%20kyber768/badge.svg?branch=master) -![Test ntruhps2048509](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps2048509/badge.svg?branch=master) -![Test ntruhps4096821](https://github.com/PQClean/PQClean/workflows/Test%20ntruhps4096821/badge.svg?branch=master) -![Test ntrulpr761](https://github.com/PQClean/PQClean/workflows/Test%20ntrulpr761/badge.svg?branch=master) -![Test kyber512](https://github.com/PQClean/PQClean/workflows/Test%20kyber512/badge.svg?branch=master) -![Test hqc-rmrs-128](https://github.com/PQClean/PQClean/workflows/Test%20hqc-rmrs-128/badge.svg?branch=master) -![Test mceliece6688128](https://github.com/PQClean/PQClean/workflows/Test%20mceliece6688128/badge.svg?branch=master) diff --git a/.github/workflows/sign_dilithium2aes.yml b/.github/workflows/sign_dilithium2aes.yml new file mode 100644 index 00000000..ede46b5a --- /dev/null +++ b/.github/workflows/sign_dilithium2aes.yml @@ -0,0 +1,204 @@ +on: + push: + paths: + # build if tests change + - 'test/**' + # do not build if other schemes duplicate_consistency 
files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium2aes*.yml' + # build if common files change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium2aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium2aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + pull_request: + paths: + # build if tests change + - 'test/**' + # do not build if other schemes duplicate_consistency files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium2aes*.yml' + # build if common files change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium2aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium2aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + schedule: + - cron: '5 4 * * *' + +name: Test dilithium2aes + +jobs: + test-native: + runs-on: ubuntu-latest + container: + image: pqclean/ci-container:${{ matrix.arch }} + env: + PQCLEAN_ONLY_SCHEMES: dilithium2aes + CC: ccache ${{ matrix.cc }} + CCACHE_NOSTATS: 1 + CCACHE_DIR: /ccache + CCACHE_SLOPPINESS: include_file_mtime + strategy: + matrix: + arch: + - amd64 + - i386 + cc: + - gcc + - clang + steps: + - name: Cancel Previous Runs + uses: thomwiggers/cancel-workflow-action@all_but_latest + with: + all_but_latest: true + access_token: ${{ github.token }} + continue-on-error: true + if: matrix.arch == 'amd64' && matrix.cc == 'gcc' + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: /ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Install python dependencies + run: | + python3 -m 
pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python3 -m pytest --verbose --numprocesses=auto + test-emulated: + needs: + - test-native + runs-on: ubuntu-latest + strategy: + matrix: + arch: + - armhf + - unstable-ppc + cc: + - gcc + - clang + env: + CC: ${{ matrix.cc }} + steps: + - name: Register qemu-user-static + run: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: ~/ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Run tests in container + run: | + docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium2aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ + export CCACHE_NOSTATS=1 && \ + export CCACHE_DIR=/ccache && \ + export CCACHE_SLOPPINESS=include_file_mtime && \ + export CC=\"ccache $CC\" && \ + pip3 install -U -r requirements.txt && \ + cd test && \ + python3 -m pytest --verbose --numprocesses=auto" + test-windows: + needs: + - test-native + strategy: + matrix: + bits: + - 64 + - 32 + env: + PQCLEAN_ONLY_SCHEMES: dilithium2aes + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Setup astyle + run: | + # Setup strong crypto + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord + Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe" + shell: powershell + - name: Setup Python + uses: 
actions/setup-python@main + with: + python-version: "3.x" + - name: Install python requirements + run: python -m pip install -U -r requirements.txt + - name: Run tests + run: | + call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat" + cd test + python -m pytest --verbose --numprocesses=auto + shell: cmd + test-macos: + needs: + - test-native + env: + PQCLEAN_ONLY_SCHEMES: dilithium2aes + CCACHE_NOSTATS: 1 + CCACHE_SLOPPINESS: include_file_mtime + # XCode version + DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer + strategy: + matrix: + compiler: + - clang # XCode (Apple LLVM/Clang) + - gcc9 # GNU (Homebrew) + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Install astyle + run: | + brew install astyle + - name: Set up GCC9 compiler + run: 'export PATH="/usr/local/bin:$PATH" && export CC=gcc-9' + if: matrix.compiler == 'gcc9' + - name: Setup Python + uses: actions/setup-python@main + with: + python-version: "3.x" + - name: Install Python dependencies + run: python -m pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python -m pytest --verbose --numprocesses=auto + diff --git a/.github/workflows/sign_dilithium3aes.yml b/.github/workflows/sign_dilithium3aes.yml new file mode 100644 index 00000000..8b7202ff --- /dev/null +++ b/.github/workflows/sign_dilithium3aes.yml @@ -0,0 +1,204 @@ +on: + push: + paths: + # build if tests change + - 'test/**' + # do not build if other schemes duplicate_consistency files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium3aes*.yml' + # build if common files change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium3aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium3aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + pull_request: + paths: + # build if 
tests change + - 'test/**' + # do not build if other schemes duplicate_consistency files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium3aes*.yml' + # build if common files change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium3aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium3aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + schedule: + - cron: '5 4 * * *' + +name: Test dilithium3aes + +jobs: + test-native: + runs-on: ubuntu-latest + container: + image: pqclean/ci-container:${{ matrix.arch }} + env: + PQCLEAN_ONLY_SCHEMES: dilithium3aes + CC: ccache ${{ matrix.cc }} + CCACHE_NOSTATS: 1 + CCACHE_DIR: /ccache + CCACHE_SLOPPINESS: include_file_mtime + strategy: + matrix: + arch: + - amd64 + - i386 + cc: + - gcc + - clang + steps: + - name: Cancel Previous Runs + uses: thomwiggers/cancel-workflow-action@all_but_latest + with: + all_but_latest: true + access_token: ${{ github.token }} + continue-on-error: true + if: matrix.arch == 'amd64' && matrix.cc == 'gcc' + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: /ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Install python dependencies + run: | + python3 -m pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python3 -m pytest --verbose --numprocesses=auto + test-emulated: + needs: + - test-native + runs-on: ubuntu-latest + strategy: + matrix: + arch: + - armhf + - unstable-ppc + cc: + - gcc + - clang + env: + CC: ${{ matrix.cc }} + steps: + - name: Register qemu-user-static + run: | + docker run --rm --privileged 
multiarch/qemu-user-static:register --reset + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: ~/ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Run tests in container + run: | + docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium3aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ + export CCACHE_NOSTATS=1 && \ + export CCACHE_DIR=/ccache && \ + export CCACHE_SLOPPINESS=include_file_mtime && \ + export CC=\"ccache $CC\" && \ + pip3 install -U -r requirements.txt && \ + cd test && \ + python3 -m pytest --verbose --numprocesses=auto" + test-windows: + needs: + - test-native + strategy: + matrix: + bits: + - 64 + - 32 + env: + PQCLEAN_ONLY_SCHEMES: dilithium3aes + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Setup astyle + run: | + # Setup strong crypto + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord + Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe" + shell: powershell + - name: Setup Python + uses: actions/setup-python@main + with: + python-version: "3.x" + - name: Install python requirements + run: python -m pip install -U -r requirements.txt + - name: Run tests + run: | + call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat" + cd test + python -m pytest --verbose --numprocesses=auto + shell: cmd + 
test-macos: + needs: + - test-native + env: + PQCLEAN_ONLY_SCHEMES: dilithium3aes + CCACHE_NOSTATS: 1 + CCACHE_SLOPPINESS: include_file_mtime + # XCode version + DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer + strategy: + matrix: + compiler: + - clang # XCode (Apple LLVM/Clang) + - gcc9 # GNU (Homebrew) + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Install astyle + run: | + brew install astyle + - name: Set up GCC9 compiler + run: 'export PATH="/usr/local/bin:$PATH" && export CC=gcc-9' + if: matrix.compiler == 'gcc9' + - name: Setup Python + uses: actions/setup-python@main + with: + python-version: "3.x" + - name: Install Python dependencies + run: python -m pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python -m pytest --verbose --numprocesses=auto + diff --git a/.github/workflows/sign_dilithium4.yml b/.github/workflows/sign_dilithium5.yml similarity index 92% rename from .github/workflows/sign_dilithium4.yml rename to .github/workflows/sign_dilithium5.yml index 82c27ce6..cc1797b4 100644 --- a/.github/workflows/sign_dilithium4.yml +++ b/.github/workflows/sign_dilithium5.yml @@ -5,13 +5,13 @@ on: - 'test/**' # do not build if other schemes duplicate_consistency files change - '!test/duplicate_consistency/*.yml' - - 'test/duplicate_consistency/dilithium4*.yml' + - 'test/duplicate_consistency/dilithium5*.yml' # build if common files change - 'common/**' # build if scheme changed - - 'crypto_sign/dilithium4/**' + - 'crypto_sign/dilithium5/**' # build if workflow file changed - - '.github/workflows/sign_dilithium4.yml' + - '.github/workflows/sign_dilithium5.yml' # Build if any files in the root change, except .md files - '*' - '!*.md' @@ -21,20 +21,20 @@ on: - 'test/**' # do not build if other schemes duplicate_consistency files change - '!test/duplicate_consistency/*.yml' - - 'test/duplicate_consistency/dilithium4*.yml' + - 'test/duplicate_consistency/dilithium5*.yml' 
# build if common files change - 'common/**' # build if scheme changed - - 'crypto_sign/dilithium4/**' + - 'crypto_sign/dilithium5/**' # build if workflow file changed - - '.github/workflows/sign_dilithium4.yml' + - '.github/workflows/sign_dilithium5.yml' # Build if any files in the root change, except .md files - '*' - '!*.md' schedule: - cron: '5 4 * * *' -name: Test dilithium4 +name: Test dilithium5 jobs: test-native: @@ -42,7 +42,7 @@ jobs: container: image: pqclean/ci-container:${{ matrix.arch }} env: - PQCLEAN_ONLY_SCHEMES: dilithium4 + PQCLEAN_ONLY_SCHEMES: dilithium5 CC: ccache ${{ matrix.cc }} CCACHE_NOSTATS: 1 CCACHE_DIR: /ccache @@ -124,7 +124,7 @@ jobs: key: v1-python-pip - name: Run tests in container run: | - docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium4 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ + docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5 -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ export CCACHE_NOSTATS=1 && \ export CCACHE_DIR=/ccache && \ export CCACHE_SLOPPINESS=include_file_mtime && \ @@ -141,7 +141,7 @@ jobs: - 64 - 32 env: - PQCLEAN_ONLY_SCHEMES: dilithium4 + PQCLEAN_ONLY_SCHEMES: dilithium5 runs-on: windows-latest steps: - uses: actions/checkout@v2 @@ -170,7 +170,7 @@ jobs: needs: - test-native env: - PQCLEAN_ONLY_SCHEMES: dilithium4 + PQCLEAN_ONLY_SCHEMES: dilithium5 CCACHE_NOSTATS: 1 CCACHE_SLOPPINESS: include_file_mtime # XCode version diff --git a/.github/workflows/sign_dilithium5aes.yml b/.github/workflows/sign_dilithium5aes.yml new file mode 100644 index 00000000..3696bc4a --- /dev/null +++ b/.github/workflows/sign_dilithium5aes.yml @@ -0,0 +1,204 @@ +on: + push: + paths: + # build if tests change + - 'test/**' + # do not build if other schemes duplicate_consistency files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium5aes*.yml' + # build if common files 
change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium5aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium5aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + pull_request: + paths: + # build if tests change + - 'test/**' + # do not build if other schemes duplicate_consistency files change + - '!test/duplicate_consistency/*.yml' + - 'test/duplicate_consistency/dilithium5aes*.yml' + # build if common files change + - 'common/**' + # build if scheme changed + - 'crypto_sign/dilithium5aes/**' + # build if workflow file changed + - '.github/workflows/sign_dilithium5aes.yml' + # Build if any files in the root change, except .md files + - '*' + - '!*.md' + schedule: + - cron: '5 4 * * *' + +name: Test dilithium5aes + +jobs: + test-native: + runs-on: ubuntu-latest + container: + image: pqclean/ci-container:${{ matrix.arch }} + env: + PQCLEAN_ONLY_SCHEMES: dilithium5aes + CC: ccache ${{ matrix.cc }} + CCACHE_NOSTATS: 1 + CCACHE_DIR: /ccache + CCACHE_SLOPPINESS: include_file_mtime + strategy: + matrix: + arch: + - amd64 + - i386 + cc: + - gcc + - clang + steps: + - name: Cancel Previous Runs + uses: thomwiggers/cancel-workflow-action@all_but_latest + with: + all_but_latest: true + access_token: ${{ github.token }} + continue-on-error: true + if: matrix.arch == 'amd64' && matrix.cc == 'gcc' + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: /ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Install python dependencies + run: | + python3 -m pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python3 -m pytest --verbose --numprocesses=auto + 
test-emulated: + needs: + - test-native + runs-on: ubuntu-latest + strategy: + matrix: + arch: + - armhf + - unstable-ppc + cc: + - gcc + - clang + env: + CC: ${{ matrix.cc }} + steps: + - name: Register qemu-user-static + run: | + docker run --rm --privileged multiarch/qemu-user-static:register --reset + - uses: actions/checkout@v2 + with: + submodules: true + - name: Cache ccache + uses: actions/cache@v2 + env: + cache-name: cache-ccache + with: + path: ~/ccache + key: v1-${{ runner.os }}-build-${{ env.cache-name }}-${{ matrix.cc }}-${{ env.GITHUB_WORKFLOW }}-${{ matrix.arch }} + - name: Cache pip + uses: actions/cache@v2 + env: + cache-name: cache-python-pip + with: + path: ~/.cache/pip + key: v1-python-pip + - name: Run tests in container + run: | + docker run --rm -e CI -e CC -e PQCLEAN_ONLY_SCHEMES=dilithium5aes -v $PWD:$PWD -w $PWD -v ~/ccache:/ccache pqclean/ci-container:${{ matrix.arch }} /bin/bash -c "\ + export CCACHE_NOSTATS=1 && \ + export CCACHE_DIR=/ccache && \ + export CCACHE_SLOPPINESS=include_file_mtime && \ + export CC=\"ccache $CC\" && \ + pip3 install -U -r requirements.txt && \ + cd test && \ + python3 -m pytest --verbose --numprocesses=auto" + test-windows: + needs: + - test-native + strategy: + matrix: + bits: + - 64 + - 32 + env: + PQCLEAN_ONLY_SCHEMES: dilithium5aes + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Setup astyle + run: | + # Setup strong crypto + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Wow6432Node\\Microsoft\\.NetFramework\\v4.0.30319" -Name 'SchUseStrongCrypto' -Value '1' -Type DWord + Set-ItemProperty -Path "HKLM:\\SOFTWARE\\Microsoft\\.NetFramework\\v4.0.30319" -Name "SchUseStrongCrypto" -Value '1' -Type DWord + Invoke-WebRequest -OutFile "test\\astyle.exe" "https://rded.nl/pqclean/AStyle.exe" + shell: powershell + - name: Setup Python + uses: actions/setup-python@main + with: + python-version: "3.x" + - name: Install python requirements + run: python -m pip install -U 
-r requirements.txt + - name: Run tests + run: | + call "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\VC\\Auxiliary\\Build\\vcvars${{ matrix.bits }}.bat" + cd test + python -m pytest --verbose --numprocesses=auto + shell: cmd + test-macos: + needs: + - test-native + env: + PQCLEAN_ONLY_SCHEMES: dilithium5aes + CCACHE_NOSTATS: 1 + CCACHE_SLOPPINESS: include_file_mtime + # XCode version + DEVELOPER_DIR: /Applications/Xcode_11.5.app/Contents/Developer + strategy: + matrix: + compiler: + - clang # XCode (Apple LLVM/Clang) + - gcc9 # GNU (Homebrew) + runs-on: macos-latest + steps: + - uses: actions/checkout@v2 + with: + submodules: true + - name: Install astyle + run: | + brew install astyle + - name: Set up GCC9 compiler + run: 'export PATH="/usr/local/bin:$PATH" && export CC=gcc-9' + if: matrix.compiler == 'gcc9' + - name: Setup Python + uses: actions/setup-python@main + with: + python-version: "3.x" + - name: Install Python dependencies + run: python -m pip install -U -r requirements.txt + - name: Run tests + run: | + cd test + python -m pytest --verbose --numprocesses=auto + diff --git a/BADGES.md b/BADGES.md new file mode 100644 index 00000000..e69de29b diff --git a/crypto_sign/dilithium2/META.yml b/crypto_sign/dilithium2/META.yml index 9156e58f..0249aac7 100644 --- a/crypto_sign/dilithium2/META.yml +++ b/crypto_sign/dilithium2/META.yml @@ -1,11 +1,11 @@ name: Dilithium2 type: signature -claimed-nist-level: 1 -length-public-key: 1184 -length-secret-key: 2800 -length-signature: 2044 -nistkat-sha256: 23b7d52a268bbd8633d139b64a1b0e3263777cb2b074f7af0a7fd315afe94d18 -testvectors-sha256: d647039ae7e1785414c64934d5ae37518f259acab95d6a6e873e9b6d3ad63dfd +claimed-nist-level: 2 +length-public-key: 1312 +length-secret-key: 2544 +length-signature: 2420 +nistkat-sha256: 9c636528bf81c03df6ad8f9471cb1b4d9097d66af825d4f60b7ff0d941ca4d37 +testvectors-sha256: 166fc2481358d5a1b7a528b30af36ad069b049b5755cf63b843ce0f25f35aeb6 principal-submitters: - Vadim 
Lyubashevsky auxiliary-submitters: @@ -17,15 +17,15 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium - name: avx2 - version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium supported_platforms: - - architecture: x86_64 - operating_systems: - - Darwin - - Linux - required_flags: - - avx2 - - bmi1 - - popcnt + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium2/avx2/LICENSE b/crypto_sign/dilithium2/avx2/LICENSE index 40541676..08473af7 100644 --- a/crypto_sign/dilithium2/avx2/LICENSE +++ b/crypto_sign/dilithium2/avx2/LICENSE @@ -1,6 +1,5 @@ Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium2/avx2/Makefile b/crypto_sign/dilithium2/avx2/Makefile index 12418161..47101200 100644 --- a/crypto_sign/dilithium2/avx2/Makefile +++ b/crypto_sign/dilithium2/avx2/Makefile @@ -1,34 +1,27 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium2_avx2.a - -SOURCES = fips202x4.c invntt.S nttconsts.c ntt.S packing.c pointwise.S poly.c \ - polyvec.c reduce.S rejsample.c rounding.c sign.c stream.c -OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ - polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o -HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ - fips202x4.h shuffle.inc cdecl.inc - -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ - -Wmissing-prototypes -Wredundant-decls -std=c99 \ - -Wcast-align -Werror=shadow\ - -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) - -all: $(LIB) - +HEADERS=align.h api.h cdecl.h consts.h fips202x4.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=consts.o fips202x4.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o symmetric-shake.o f1600x4.o invntt.o ntt.o pointwise.o shuffle.o KECCAK4XDIR=../../../common/keccak4x KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) +CFLAGS=-mavx2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< %.o: %.S $(HEADERS) - $(CC) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) - $(AR) -r $@ $^ + $(AR) -r $@ $(OBJECTS) $(KECCAK4X) $(KECCAK4X): $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) @@ -36,5 +29,3 @@ $(KECCAK4X): clean: $(RM) $(OBJECTS) $(RM) $(LIB) - $(MAKE) -C $(KECCAK4XDIR) clean - diff --git 
a/crypto_sign/dilithium2/avx2/align.h b/crypto_sign/dilithium2/avx2/align.h new file mode 100644 index 00000000..1e74b915 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM2_AVX2_ALIGN_H + +#include +#include + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } + +#endif diff --git a/crypto_sign/dilithium2/avx2/alignment.h b/crypto_sign/dilithium2/avx2/alignment.h deleted file mode 100644 index 40279ed3..00000000 --- a/crypto_sign/dilithium2/avx2/alignment.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H -#define PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H - -#define ALIGNED_UINT8(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/32]; \ - } - -#define ALIGNED_UINT32(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#define ALIGNED_UINT64(N) \ - union { \ - uint64_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#endif //PQCLEAN_DILITHIUM2_AVX2_ALIGNMENT_H diff --git a/crypto_sign/dilithium2/avx2/api.h b/crypto_sign/dilithium2/avx2/api.h index c8375d00..7eaf76b3 100644 --- a/crypto_sign/dilithium2/avx2/api.h +++ b/crypto_sign/dilithium2/avx2/api.h @@ -4,26 +4,13 @@ #include #include - -#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1184U -#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2800U -#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2044U - +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES 2544 +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES 2420 #define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const 
uint8_t *sk); - -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium4/avx2/cdecl.inc b/crypto_sign/dilithium2/avx2/cdecl.h similarity index 55% rename from crypto_sign/dilithium4/avx2/cdecl.inc rename to crypto_sign/dilithium2/avx2/cdecl.h index 01dc4734..d23e7646 100644 --- a/crypto_sign/dilithium4/avx2/cdecl.inc +++ b/crypto_sign/dilithium2/avx2/cdecl.h @@ -1,5 +1,14 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_CDECL -#define PQCLEAN_DILITHIUM4_AVX2_CDECL +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM2_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 /* The C ABI on MacOS exports all symbols with a leading * underscore. 
This means that any symbols we refer to from @@ -9,10 +18,7 @@ * This define helps us get around this */ -#if defined(__WIN32__) || defined(__APPLE__) -#define cdecl(s) _##s -#else +#define _cdecl(s) _##s #define cdecl(s) s -#endif #endif diff --git a/crypto_sign/dilithium2/avx2/consts.c b/crypto_sign/dilithium2/avx2/consts.c new file mode 100644 index 00000000..7190fa5c --- /dev/null +++ b/crypto_sign/dilithium2/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) + +const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, 
-594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + -1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 
137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 
3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium2/avx2/consts.h b/crypto_sign/dilithium2/avx2/consts.h new file mode 100644 index 00000000..44a50460 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM2_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM2_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium2/avx2/f1600x4.S 
b/crypto_sign/dilithium2/avx2/f1600x4.S new file mode 100644 index 00000000..76f89ca6 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/f1600x4.S @@ -0,0 +1,909 @@ +/* Taken from Bas Westerbaan's new 4-way SHAKE implementation + * for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), + * but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ + +#include "cdecl.h" + +.data +.p2align 5 +rho8: +.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 +rho56: +.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 + +.text +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4) +cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_f1600x4): +vmovdqa rho8(%rip), %ymm0 +movq $6, %rax +looptop: +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, 
%ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 192(%rdi), %ymm4, %ymm9 +vpxor 384(%rdi), %ymm3, %ymm10 +vpxor 576(%rdi), %ymm2, %ymm11 +vpxor 768(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 0(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 96(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 320(%rdi), %ymm5, %ymm10 +vpxor 512(%rdi), %ymm4, %ymm11 +vpxor 704(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 
320(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 32(%rdi), %ymm4, %ymm8 +vpxor 224(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 608(%rdi), %ymm1, %ymm11 +vpxor 640(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 128(%rdi), %ymm1, %ymm8 +vpxor 160(%rdi), %ymm5, %ymm9 +vpxor 352(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 736(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, 
%ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 64(%rdi), %ymm3, %ymm8 +vpxor 256(%rdi), %ymm2, %ymm9 +vpxor 448(%rdi), %ymm1, %ymm10 +vpxor 480(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 448(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq 
$1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 512(%rdi), %ymm4, %ymm9 +vpxor 224(%rdi), %ymm3, %ymm10 +vpxor 736(%rdi), %ymm2, %ymm11 +vpxor 448(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 8(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 576(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 640(%rdi), %ymm5, %ymm10 +vpxor 352(%rdi), %ymm4, %ymm11 +vpxor 64(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, 
%ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 192(%rdi), %ymm4, %ymm8 +vpxor 704(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 128(%rdi), %ymm1, %ymm11 +vpxor 480(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 768(%rdi), %ymm1, %ymm8 +vpxor 320(%rdi), %ymm5, %ymm9 +vpxor 32(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 256(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, 
%ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 384(%rdi), %ymm3, %ymm8 +vpxor 96(%rdi), %ymm2, %ymm9 +vpxor 608(%rdi), %ymm1, %ymm10 +vpxor 160(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 608(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 
+vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 352(%rdi), %ymm4, %ymm9 +vpxor 704(%rdi), %ymm3, %ymm10 +vpxor 256(%rdi), %ymm2, %ymm11 +vpxor 608(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 16(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 736(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 480(%rdi), %ymm5, %ymm10 +vpxor 32(%rdi), %ymm4, %ymm11 +vpxor 384(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, 
%ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 512(%rdi), %ymm4, %ymm8 +vpxor 64(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 768(%rdi), %ymm1, %ymm11 +vpxor 160(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 448(%rdi), %ymm1, %ymm8 +vpxor 640(%rdi), %ymm5, %ymm9 +vpxor 192(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 96(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 
+vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 224(%rdi), %ymm3, %ymm8 +vpxor 576(%rdi), %ymm2, %ymm9 +vpxor 128(%rdi), %ymm1, %ymm10 +vpxor 320(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 128(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, 
%ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 32(%rdi), %ymm4, %ymm9 +vpxor 64(%rdi), %ymm3, %ymm10 +vpxor 96(%rdi), %ymm2, %ymm11 +vpxor 128(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 24(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa 
%ymm14, 32(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 256(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 160(%rdi), %ymm5, %ymm10 +vpxor 192(%rdi), %ymm4, %ymm11 +vpxor 224(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 352(%rdi), %ymm4, %ymm8 +vpxor 384(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 448(%rdi), %ymm1, %ymm11 +vpxor 320(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 
320(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 608(%rdi), %ymm1, %ymm8 +vpxor 480(%rdi), %ymm5, %ymm9 +vpxor 512(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 576(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 704(%rdi), %ymm3, %ymm8 +vpxor 736(%rdi), %ymm2, %ymm9 +vpxor 768(%rdi), %ymm1, %ymm10 +vpxor 640(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, 
%ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 768(%rdi) +addq $32, %rsi +subq $1, %rax +jnz looptop +ret diff --git a/crypto_sign/dilithium2/avx2/fips202x4.c b/crypto_sign/dilithium2/avx2/fips202x4.c index 9fe44d0c..53e72945 100644 --- a/crypto_sign/dilithium2/avx2/fips202x4.c +++ b/crypto_sign/dilithium2/avx2/fips202x4.c @@ -1,233 +1,219 @@ -#include -#include - #include "fips202.h" #include "fips202x4.h" -#include "params.h" +#include +#include +#include +#include #define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) -static uint64_t load64(const uint8_t *x) { - uint64_t r = 0; +/* Keccak round constants */ +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; - for (size_t i = 0; i < 8; ++i) { - r |= (uint64_t)x[i] << 8 * i; - } - - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - for (size_t i = 0; i < 8; ++i) { - x[i] = (uint8_t)(u >> 8 * i); - } -} - -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define 
KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, - uint8_t r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, - uint8_t p) { +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { size_t i; - uint8_t t0[200]; - uint8_t t1[200]; - uint8_t t2[200]; - uint8_t t3[200]; - uint64_t *ss = (uint64_t *)s; + uint64_t pos = 0; + __m256i t, idx; for (i = 0; i < 25; ++i) { - s[i] = _mm256_xor_si256(s[i], s[i]); + s[i] = _mm256_setzero_si256(); } - while (mlen >= r) { + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 * i); + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } + inlen -= r; - KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; + PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); } - for (i = 0; i < r; ++i) { - t0[i] = 0; - t1[i] = 0; - t2[i] = 0; - t3[i] = 0; + for (i = 0; i < inlen / 8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; + inlen -= 8 * i; + + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); } - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 
* i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 + 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); - } + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } - -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, size_t nblocks, - uint8_t r, - __m256i *s) { - uint64_t *ss = (uint64_t *)s; + unsigned int r, + __m256i s[25]) { + unsigned int i; + __m128d t; while (nblocks > 0) { - KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < r / 8; ++i) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); + PQCLEAN_DILITHIUM2_AVX2_f1600x4(s, KeccakF_RoundConstants); + for (i = 0; i < r / 8; ++i) { + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); } - h0 += r; - h1 += r; - h2 += r; - h3 += r; + out0 += r; + out1 += r; + out2 += r; + out3 += r; --nblocks; } - } -void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); +void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } -void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, 
- uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); +void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } -void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); +void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } -void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); +void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } -void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE128_RATE; +void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[4][SHAKE128_RATE]; - __m256i s[25]; + keccakx4_state state; - 
PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - h0 += nblocks * SHAKE128_RATE; - h1 += nblocks * SHAKE128_RATE; - h2 += nblocks * SHAKE128_RATE; - h3 += nblocks * SHAKE128_RATE; - hlen -= nblocks * SHAKE128_RATE; + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; - if (hlen) { - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; + if (outlen) { + PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } -void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE256_RATE; +void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[4][SHAKE256_RATE]; - __m256i s[25]; + keccakx4_state state; - PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); + 
PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - h0 += nblocks * SHAKE256_RATE; - h1 += nblocks * SHAKE256_RATE; - h2 += nblocks * SHAKE256_RATE; - h3 += nblocks * SHAKE256_RATE; - hlen -= nblocks * SHAKE256_RATE; + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; - if (hlen) { - PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; + if (outlen) { + PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } diff --git a/crypto_sign/dilithium2/avx2/fips202x4.h b/crypto_sign/dilithium2/avx2/fips202x4.h index 5702625e..9026106b 100644 --- a/crypto_sign/dilithium2/avx2/fips202x4.h +++ b/crypto_sign/dilithium2/avx2/fips202x4.h @@ -5,62 +5,60 @@ #include #include -#include "params.h" +typedef struct { + __m256i s[25]; +} keccakx4_state; -void PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM2_AVX2_f1600x4(__m256i *s, const uint64_t *rc); -void PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); +void PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); -void PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + 
uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); -void PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); +void PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); -void PQCLEAN_DILITHIUM2_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); -void PQCLEAN_DILITHIUM2_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM2_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +void PQCLEAN_DILITHIUM2_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); #endif diff --git a/crypto_sign/dilithium2/avx2/invntt.S b/crypto_sign/dilithium2/avx2/invntt.S index 5a5ee1ca..364559b7 100644 --- a/crypto_sign/dilithium2/avx2/invntt.S +++ b/crypto_sign/dilithium2/avx2/invntt.S @@ -1,282 +1,240 @@ +#include "cdecl.h" .include "shuffle.inc" -#include "cdecl.inc" -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 -vpaddd %ymm2,%ymm\l0,%ymm12 -vpaddd %ymm2,%ymm\l1,%ymm13 -vpaddd %ymm2,%ymm\l2,%ymm14 +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 
+vpaddd %ymm\h,%ymm\l,%ymm\l -vpsubd %ymm\h0,%ymm12,%ymm12 -vpsubd %ymm\h1,%ymm13,%ymm13 -vpsubd %ymm\h2,%ymm14,%ymm14 +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 -vpmuludq %ymm\z0,%ymm12,%ymm12 -vpmuludq %ymm\z0,%ymm13,%ymm13 -vpaddd %ymm2,%ymm\l3,%ymm15 +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h -vpmuludq %ymm\z1,%ymm14,%ymm14 -vpsubd %ymm\h3,%ymm15,%ymm15 -vpaddd %ymm\l0,%ymm\h0,%ymm\l0 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 -vpmuludq %ymm\z1,%ymm15,%ymm15 -vpaddd %ymm\l1,%ymm\h1,%ymm\l1 -vpaddd %ymm\l2,%ymm\h2,%ymm\l2 +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h -vpaddd %ymm\l3,%ymm\h3,%ymm\l3 - -vpmuludq %ymm0,%ymm12,%ymm\h0 -vpmuludq %ymm0,%ymm13,%ymm\h1 -vpmuludq %ymm0,%ymm14,%ymm\h2 -vpmuludq %ymm0,%ymm15,%ymm\h3 -vpmuludq %ymm1,%ymm\h0,%ymm\h0 -vpmuludq %ymm1,%ymm\h1,%ymm\h1 -vpmuludq %ymm1,%ymm\h2,%ymm\h2 -vpmuludq %ymm1,%ymm\h3,%ymm\h3 -vpaddq %ymm12,%ymm\h0,%ymm\h0 -vpaddq %ymm13,%ymm\h1,%ymm\h1 -vpaddq %ymm14,%ymm\h2,%ymm\h2 -vpaddq %ymm15,%ymm\h3,%ymm\h3 -vpsrlq $32,%ymm\h0,%ymm\h0 -vpsrlq $32,%ymm\h1,%ymm\h1 -vpsrlq $32,%ymm\h2,%ymm\h2 -vpsrlq $32,%ymm\h3,%ymm\h3 +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h .endm -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx) -cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 -#load -vmovdqa (%rsi),%ymm6 -vmovdqa 32(%rsi),%ymm7 -vmovdqa 64(%rsi),%ymm5 -vmovdqa 96(%rsi),%ymm10 +/* level 0 */ +vpermq 
$0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 -#reorder -shuffle8 6,5,8,5 -shuffle8 7,10,6,10 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 -shuffle4 8,6,4,6 -shuffle4 5,10,8,10 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 -vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 -vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 -level0: -vpmovzxdq (%rdx),%ymm3 -vpmovzxdq 16(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpmovzxdq 
32(%rdx),%ymm5 -vpmovzxdq 48(%rdx),%ymm7 +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 -vpaddd %ymm10,%ymm11,%ymm10 +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 -level1: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpmovzxdq 64(%rdx),%ymm15 -vpmovzxdq 80(%rdx),%ymm3 +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 -butterfly 4,5,8,9,6,7,10,11 +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 -level2: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpmovzxdq 96(%rdx),%ymm3 +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm -butterfly 4,5,6,7,8,9,10,11,3,3 +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 
384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 -#shuffle -shuffle4 4,5,3,5 -shuffle4 6,7,4,7 -shuffle4 8,9,6,9 -shuffle4 10,11,8,11 +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 -level3: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpbroadcastd 112(%rdx),%ymm14 -vpbroadcastd 116(%rdx),%ymm15 -vpblendd $0xF0,%ymm15,%ymm14,%ymm10 +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 -butterfly 3,4,6,8,5,7,9,11,10,10 +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 -#shuffle -shuffle8 3,4,10,4 -shuffle8 6,8,3,8 -shuffle8 5,7,6,7 -shuffle8 9,11,5,11 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 -level4: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpbroadcastd 120(%rdx),%ymm9 +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) -butterfly 10,3,6,5,4,8,7,11,9,9 +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa 
%ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx) -cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x256q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 256(%rsi),%ymm5 -vmovdqa 512(%rsi),%ymm6 -vmovdqa 768(%rsi),%ymm7 -vmovdqa 1024(%rsi),%ymm8 -vmovdqa 1280(%rsi),%ymm9 -vmovdqa 1536(%rsi),%ymm10 -vmovdqa 1792(%rsi),%ymm11 - -level5: -vpbroadcastd (%rdx),%ymm3 -vpbroadcastd 4(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 - -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 - -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 - -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpbroadcastd 8(%rdx),%ymm5 -vpbroadcastd 12(%rdx),%ymm7 - -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 - -vpaddd %ymm10,%ymm11,%ymm10 - -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 - -level6: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpbroadcastd 16(%rdx),%ymm15 -vpbroadcastd 20(%rdx),%ymm3 - -butterfly 4,5,8,9,6,7,10,11 - -level7: -#cdecl(PQCLEAN_DILITHIUM2_AVX2_zetas) -vpbroadcastd 24(%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11,3,3 - -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xdiv)(%rip),%ymm3 - -vpmuludq %ymm3,%ymm4,%ymm4 -vpmuludq %ymm3,%ymm5,%ymm5 -vpmuludq %ymm3,%ymm6,%ymm6 
-vpmuludq %ymm3,%ymm7,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm12,%ymm4,%ymm4 -vpaddq %ymm13,%ymm5,%ymm5 -vpaddq %ymm14,%ymm6,%ymm6 -vpaddq %ymm15,%ymm7,%ymm7 -vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm6,%ymm6 -vpsrlq $32,%ymm7,%ymm7 - -#store -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_mask)(%rip),%ymm3 -vpermd %ymm4,%ymm3,%ymm4 -vpermd %ymm5,%ymm3,%ymm5 -vpermd %ymm6,%ymm3,%ymm6 -vpermd %ymm7,%ymm3,%ymm7 -vpermd %ymm8,%ymm3,%ymm8 -vpermd %ymm9,%ymm3,%ymm9 -vpermd %ymm10,%ymm3,%ymm10 -vpermd %ymm11,%ymm3,%ymm11 -vmovdqa %xmm4,(%rdi) -vmovdqa %xmm5,128(%rdi) -vmovdqa %xmm6,256(%rdi) -vmovdqa %xmm7,384(%rdi) -vmovdqa %xmm8,512(%rdi) -vmovdqa %xmm9,640(%rdi) -vmovdqa %xmm10,768(%rdi) -vmovdqa %xmm11,896(%rdi) +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd $0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_invntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 
1 +levels6t7 2 +levels6t7 3 ret diff --git a/crypto_sign/dilithium2/avx2/ntt.S b/crypto_sign/dilithium2/avx2/ntt.S index 1e1f7754..5c0a80fc 100644 --- a/crypto_sign/dilithium2/avx2/ntt.S +++ b/crypto_sign/dilithium2/avx2/ntt.S @@ -1,179 +1,199 @@ +#include "cdecl.h" .include "shuffle.inc" -#include "cdecl.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 -#mul -vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 -vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 -vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 -vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 -#reduce -vpmuludq %ymm0,%ymm\rh0,%ymm12 -vpmuludq %ymm0,%ymm\rh1,%ymm13 -vpmuludq %ymm0,%ymm\rh2,%ymm14 -vpmuludq %ymm0,%ymm\rh3,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm\rh0,%ymm12,%ymm12 -vpaddq %ymm\rh1,%ymm13,%ymm13 -vpaddq %ymm\rh2,%ymm14,%ymm14 -vpaddq %ymm\rh3,%ymm15,%ymm15 -vpsrlq $32,%ymm12,%ymm12 -vpsrlq $32,%ymm13,%ymm13 -vpsrlq $32,%ymm14,%ymm14 -vpsrlq $32,%ymm15,%ymm15 +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 -#update -vpaddd %ymm2,%ymm\rl0,%ymm\rh0 -vpaddd %ymm2,%ymm\rl1,%ymm\rh1 -vpaddd %ymm2,%ymm\rl2,%ymm\rh2 -vpaddd %ymm2,%ymm\rl3,%ymm\rh3 -vpaddd %ymm12,%ymm\rl0,%ymm\rl0 -vpaddd %ymm13,%ymm\rl1,%ymm\rl1 -vpaddd %ymm14,%ymm\rl2,%ymm\rl2 -vpaddd %ymm15,%ymm\rl3,%ymm\rl3 -vpsubd %ymm12,%ymm\rh0,%ymm\rh0 -vpsubd %ymm13,%ymm\rh1,%ymm\rh1 -vpsubd %ymm14,%ymm\rh2,%ymm\rh2 -vpsubd %ymm15,%ymm\rh3,%ymm\rh3 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l .endm -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx) 
-cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 -level0: -#zetas -vpbroadcastd (%rdx),%ymm3 +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 -#load -vpmovzxdq (%rsi),%ymm4 -vpmovzxdq 128(%rsi),%ymm5 -vpmovzxdq 256(%rsi),%ymm6 -vpmovzxdq 384(%rsi),%ymm7 -vpmovzxdq 512(%rsi),%ymm8 -vpmovzxdq 640(%rsi),%ymm9 -vpmovzxdq 768(%rsi),%ymm10 -vpmovzxdq 896(%rsi),%ymm11 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 -butterfly 4,5,6,7,8,9,10,11 +/* level 1 */ +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 -level1: -#PQCLEAN_DILITHIUM2_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 -butterfly 4,5,8,9,6,7,10,11,12,12,13,13 +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm -level2: -#PQCLEAN_DILITHIUM2_AVX2_zetas -vpbroadcastd 12(%rdx),%ymm12 -vpbroadcastd 16(%rdx),%ymm13 -vpbroadcastd 20(%rdx),%ymm14 -vpbroadcastd 24(%rdx),%ymm15 +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 
+vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 -butterfly 4,6,8,10,5,7,9,11,12,13,14,15 +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,256(%rdi) -vmovdqa %ymm6,512(%rdi) -vmovdqa %ymm7,768(%rdi) -vmovdqa %ymm8,1024(%rdi) -vmovdqa %ymm9,1280(%rdi) -vmovdqa %ymm10,1536(%rdi) -vmovdqa %ymm11,1792(%rdi) - -ret - -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx) -cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x2q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 32(%rsi),%ymm5 -vmovdqa 64(%rsi),%ymm6 -vmovdqa 96(%rsi),%ymm7 -vmovdqa 128(%rsi),%ymm8 -vmovdqa 160(%rsi),%ymm9 -vmovdqa 192(%rsi),%ymm10 -vmovdqa 224(%rsi),%ymm11 - -level3: -#zetas -vpbroadcastd (%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11 - -level4: -#PQCLEAN_DILITHIUM2_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 -vpblendd $0xF0,%ymm13,%ymm12,%ymm12 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly 3,8,4,9,5,10,6,11,12,12,12,12 +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 -level5: -#zetas -vpmovzxdq 12(%rdx),%ymm12 +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 shuffle4 3,5,7,5 shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly 7,5,3,10,8,6,4,11,12,12,12,12 +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 -level6: -#zetas -vpmovzxdq 28(%rdx),%ymm12 -vpmovzxdq 44(%rdx),%ymm13 +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 -butterfly 7,5,8,6,3,10,4,11,12,12,13,13 +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 
+shuffle2 10,11,3,11 -level7: -#zetas -vpmovzxdq 60(%rdx),%ymm12 -vpmovzxdq 76(%rdx),%ymm13 -vpmovzxdq 92(%rdx),%ymm14 -vpmovzxdq 108(%rdx),%ymm15 +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 -butterfly 7,3,8,4,5,10,6,11,12,13,14,15 +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 -#store -vpsllq $32,%ymm5,%ymm5 -vpsllq $32,%ymm10,%ymm10 -vpsllq $32,%ymm6,%ymm6 -vpsllq $32,%ymm11,%ymm11 -vpblendd $0xAA,%ymm5,%ymm7,%ymm7 -vpblendd $0xAA,%ymm10,%ymm3,%ymm3 -vpblendd $0xAA,%ymm6,%ymm8,%ymm8 -vpblendd $0xAA,%ymm11,%ymm4,%ymm4 +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 -shuffle4 7,3,5,3 -shuffle4 8,4,7,4 +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 4,11,1,10,2,15 -shuffle8 5,7,6,7 -shuffle8 3,4,5,4 +/* level 7 */ +vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 -vmovdqa %ymm6,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm7,64(%rdi) -vmovdqa %ymm4,96(%rdi) +vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa 
%ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_ntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 ret + diff --git a/crypto_sign/dilithium2/avx2/ntt.h b/crypto_sign/dilithium2/avx2/ntt.h index 681f6e3f..fd62445f 100644 --- a/crypto_sign/dilithium2/avx2/ntt.h +++ b/crypto_sign/dilithium2/avx2/ntt.h @@ -1,36 +1,14 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM2_AVX2_NTT_H +#define PQCLEAN_DILITHIUM2_AVX2_NTT_H -#include +#include -#include "nttconsts.h" -#include "params.h" +void PQCLEAN_DILITHIUM2_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); +void PQCLEAN_DILITHIUM2_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); -void PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas -); -void PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas -); +void PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(__m256i *a); -void PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv -); -void PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv -); - -void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i 
*PQCLEAN_DILITHIUM2_AVX2_qdata); +void PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2_AVX2_qdata); #endif diff --git a/crypto_sign/dilithium2/avx2/nttconsts.c b/crypto_sign/dilithium2/avx2/nttconsts.c deleted file mode 100644 index a351ab32..00000000 --- a/crypto_sign/dilithium2/avx2/nttconsts.c +++ /dev/null @@ -1,80 +0,0 @@ -#include "nttconsts.h" - -#define QINV 4236238847 // -q^(-1) mod 2^32 -#define MONT 4193792ULL -#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) - - -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; - -#undef QINV -#undef MONT -#undef DIV - - -const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas = { - .as_arr = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, - 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, - 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191, - 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, - 7122806, 4296819, 5190273, 4747489, 
1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, - 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, - 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, - 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, - 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, - 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, - 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, - 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, - 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, - 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, - 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, - 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, - 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, - 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, - 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, - 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, - 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, - 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, - 4834730, 7018208, 1976782 - } -}; - -const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv = { - .as_arr = { - 6403635, 1362209, 3545687, 2286327, 846154, 
48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, - 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, - 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, - 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, - 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, - 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, - 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, - 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, - 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, - 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, - 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, - 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, - 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, - 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, - 5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, - 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, - 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, - 1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, - 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 
6136326, - 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, - 3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, - 518909, 2608894, 3975713 - } -}; diff --git a/crypto_sign/dilithium2/avx2/nttconsts.h b/crypto_sign/dilithium2/avx2/nttconsts.h deleted file mode 100644 index 107bdc87..00000000 --- a/crypto_sign/dilithium2/avx2/nttconsts.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H -#define PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H - -#include -#include - -#include "alignment.h" -#include "params.h" - -typedef ALIGNED_UINT32(8) aligned_uint32x8_t; - -typedef ALIGNED_UINT32(N) aligned_uint32xN_t; - - -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xqinv; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xq; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x2q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x256q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_mask; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8x23ones; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM2_AVX2_8xdiv; - -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas; -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM2_AVX2_zetas_inv; - -#endif //PQCLEAN_DILITHIUM2_AVX2_NTTCONSTS_H - diff --git a/crypto_sign/dilithium2/avx2/packing.c b/crypto_sign/dilithium2/avx2/packing.c index aedef4e7..d6fafe43 100644 --- a/crypto_sign/dilithium2/avx2/packing.c +++ b/crypto_sign/dilithium2/avx2/packing.c @@ -3,6 +3,7 @@ #include "poly.h" #include "polyvec.h" + /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_pack_pk * @@ -12,17 +13,18 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_pack_pk( - uint8_t 
pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { pk[i] = rho[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); } } @@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_AVX2_pack_pk( * - const polyveck *t1: pointer to output vector t1 * - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = pk[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_pack_sk * -* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
* * Arguments: - uint8_t sk[]: output byte array * - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key * - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = rho[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = key[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { sk[i] = tr[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - 
PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sk * -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Unpack secret key; the bytes are read in the order sk = (rho, key, tr, s1, s2, t0). * * Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key * - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 * - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { key[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { tr[i] = sk[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s1->vec[i], sk + i * 
POLYETA_PACKEDBYTES); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_pack_sig * -* Description: Bit-pack signature sig = (z, h, c). +* Description: Bit-pack signature sig = (c, z, h). * * Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; +void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - 
if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { + for (i = 0; i < OMEGA + K; ++i) { sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; } } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); + + sig[OMEGA + i] = (uint8_t) k; } } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_unpack_sig * -* Description: Unpack signature sig = (z, h, c). +* Description: Unpack signature sig = (c, z, h). * -* Arguments: - polyvecl *z: pointer to output vector z +* Arguments: - uint8_t *c: pointer to output challenge hash +* - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial * - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. 
**************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; +int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; /* Decode h */ k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { h->vec[i].coeffs[j] = 0; } @@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( return 1; } - for (size_t j = k; j < sig[OMEGA + i]; ++j) { + for (j = k; j < sig[OMEGA + i]; ++j) { /* Coefficients are ordered for strong unforgeability */ if (j > k && sig[j] <= sig[j - 1]) { return 1; @@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( } /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { + for (j = k; j < OMEGA; ++j) { if (sig[j]) { return 1; } } - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - return 0; } diff --git 
a/crypto_sign/dilithium2/avx2/packing.h b/crypto_sign/dilithium2/avx2/packing.h index 42e97930..12400f09 100644 --- a/crypto_sign/dilithium2/avx2/packing.h +++ b/crypto_sign/dilithium2/avx2/packing.h @@ -1,42 +1,31 @@ #ifndef PQCLEAN_DILITHIUM2_AVX2_PACKING_H #define PQCLEAN_DILITHIUM2_AVX2_PACKING_H - -#include "api.h" #include "params.h" #include "polyvec.h" +#include -void PQCLEAN_DILITHIUM2_AVX2_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1); -void PQCLEAN_DILITHIUM2_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM2_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM2_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM2_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM2_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM2_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM2_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM2_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + 
+void PQCLEAN_DILITHIUM2_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM2_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM2_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium2/avx2/params.h b/crypto_sign/dilithium2/avx2/params.h index 5d5068cd..6556cc70 100644 --- a/crypto_sign/dilithium2/avx2/params.h +++ b/crypto_sign/dilithium2/avx2/params.h @@ -2,28 +2,40 @@ #define PQCLEAN_DILITHIUM2_AVX2_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) +#define D 13 +#define ROOT_OF_UNITY 1753 #define K 4 -#define L 3 -#define ETA 6 -#define SETABITS 4 -#define BETA 325 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((Q-1)/88) #define OMEGA 80 +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_ALGNAME "Dilithium2" -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 576 + +#define POLYW1_PACKEDBYTES 192 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define 
PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium2/avx2/pointwise.S b/crypto_sign/dilithium2/avx2/pointwise.S index 1f638b1e..1c8c8122 100644 --- a/crypto_sign/dilithium2/avx2/pointwise.S +++ b/crypto_sign/dilithium2/avx2/pointwise.S @@ -1,11 +1,14 @@ #include "params.h" -#include "cdecl.inc" +#include "cdecl.h" +.text .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_avx): #consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 xor %eax,%eax _looptop1: @@ -18,41 +21,41 @@ vmovdqa 32(%rdx),%ymm12 vmovdqa 64(%rdx),%ymm14 vpsrlq $32,%ymm2,%ymm3 vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 +vmovshdup %ymm6,%ymm7 vpsrlq $32,%ymm10,%ymm11 vpsrlq $32,%ymm12,%ymm13 -vpsrlq $32,%ymm14,%ymm15 +vmovshdup %ymm14,%ymm15 #mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 -vpmuludq %ymm6,%ymm14,%ymm6 -vpmuludq %ymm7,%ymm15,%ymm7 +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 #reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq %ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 -vpaddq %ymm6,%ymm14,%ymm6 -vpaddq %ymm7,%ymm15,%ymm7 +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq 
%ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 vpsrlq $32,%ymm2,%ymm2 vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm6,%ymm6 +vmovshdup %ymm6,%ymm6 #store vpblendd $0xAA,%ymm3,%ymm2,%ymm2 @@ -67,7 +70,7 @@ add $96,%rsi add $96,%rdx add $1,%eax cmp $10,%eax -jb _looptop1 +jb _looptop1 vmovdqa (%rsi),%ymm2 vmovdqa 32(%rsi),%ymm4 @@ -75,30 +78,30 @@ vmovdqa (%rdx),%ymm10 vmovdqa 32(%rdx),%ymm12 vpsrlq $32,%ymm2,%ymm3 vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 #mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 #reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq %ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm4,%ymm4 #store vpblendd $0x55,%ymm2,%ymm3,%ymm2 @@ -116,14 
+119,14 @@ vmovdqa \off(%rdx),%ymm10 vmovdqa \off+32(%rdx),%ymm12 vpsrlq $32,%ymm6,%ymm7 vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 #mul -vpmuludq %ymm6,%ymm10,%ymm6 -vpmuludq %ymm7,%ymm11,%ymm7 -vpmuludq %ymm8,%ymm12,%ymm8 -vpmuludq %ymm9,%ymm13,%ymm9 +vpmuldq %ymm6,%ymm10,%ymm6 +vpmuldq %ymm7,%ymm11,%ymm7 +vpmuldq %ymm8,%ymm12,%ymm8 +vpmuldq %ymm9,%ymm13,%ymm9 .endm .macro acc @@ -134,10 +137,12 @@ vpaddq %ymm9,%ymm5,%ymm5 .endm .global cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx) cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx): #consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm1 +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 xor %eax,%eax _looptop2: @@ -155,23 +160,27 @@ acc pointwise 2048 acc +pointwise 3072 +acc + + #reduce -vpmuludq %ymm0,%ymm2,%ymm6 -vpmuludq %ymm0,%ymm3,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm8 -vpmuludq %ymm0,%ymm5,%ymm9 -vpmuludq %ymm1,%ymm6,%ymm6 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm8,%ymm8 -vpmuludq %ymm1,%ymm9,%ymm9 -vpaddq %ymm2,%ymm6,%ymm2 -vpaddq %ymm3,%ymm7,%ymm3 -vpaddq %ymm4,%ymm8,%ymm4 -vpaddq %ymm5,%ymm9,%ymm5 +vpmuldq %ymm0,%ymm2,%ymm6 +vpmuldq %ymm0,%ymm3,%ymm7 +vpmuldq %ymm0,%ymm4,%ymm8 +vpmuldq %ymm0,%ymm5,%ymm9 +vpmuldq %ymm1,%ymm6,%ymm6 +vpmuldq %ymm1,%ymm7,%ymm7 +vpmuldq %ymm1,%ymm8,%ymm8 +vpmuldq %ymm1,%ymm9,%ymm9 +vpsubq %ymm6,%ymm2,%ymm2 +vpsubq %ymm7,%ymm3,%ymm3 +vpsubq %ymm8,%ymm4,%ymm4 +vpsubq %ymm9,%ymm5,%ymm5 vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm4,%ymm4 #store vpblendd $0xAA,%ymm3,%ymm2,%ymm2 diff --git a/crypto_sign/dilithium2/avx2/poly.c b/crypto_sign/dilithium2/avx2/poly.c index a3f4dfc9..113e5fca 100644 --- a/crypto_sign/dilithium2/avx2/poly.c +++ b/crypto_sign/dilithium2/avx2/poly.c @@ -1,52 +1,94 @@ 
-#include -#include - +#include "align.h" +#include "consts.h" #include "fips202x4.h" #include "ntt.h" -#include "nttconsts.h" #include "params.h" #include "poly.h" -#include "reduce.h" #include "rejsample.h" #include "rounding.h" #include "symmetric.h" +#include +#include +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_reduce * -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a) { - PQCLEAN_DILITHIUM2_AVX2_reduce_avx(a->coeffs); + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_poly_csubq +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_caddq * -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a) { - PQCLEAN_DILITHIUM2_AVX2_csubq_avx(a->coeffs); +void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_freeze * -* Description: Reduce all coefficients of the polynomial to standard -* representatives. +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives. Assumes input +* coefficients to be at most 2^31 - 2^22 + 1 in +* absolute value. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a) { - PQCLEAN_DILITHIUM2_AVX2_reduce_avx(a->coeffs); - PQCLEAN_DILITHIUM2_AVX2_csubq_avx(a->coeffs); + DBENCH_START(); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(a); + PQCLEAN_DILITHIUM2_AVX2_poly_caddq(a); + + DBENCH_STOP(*tred); } /************************************************* @@ -59,20 +101,24 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a) { * - const poly *b: pointer to second summand **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_sub * -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is +* Description: Subtract polynomials. No modular reduction is * performed. 
* * Arguments: - poly *c: pointer to output polynomial @@ -81,227 +127,239 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b) { * subtraced from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM2_AVX2_8x2q.as_vec); + unsigned int i; + __m256i f, g; + DBENCH_START(); - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, twoq); - vec0 = _mm256_sub_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_shiftl * * Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. +* input coefficients to be less than 2^{31-D} in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a) { - __m256i vec; + unsigned int i; + __m256i f; + DBENCH_START(); - for (size_t i = 0; i < N / 8; i++) { - vec = _mm256_load_si256(&a->coeffs_x8[i]); - vec = _mm256_slli_epi32(vec, D); - _mm256_store_si256(&a->coeffs_x8[i], vec); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); } + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_ntt * -* Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. 
+* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a) { - ALIGNED_UINT64(N) tmp; + DBENCH_START(); - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM2_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + 1); - } - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM2_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM2_AVX2_zetas.as_arr + 8 + 31 * i); - } + PQCLEAN_DILITHIUM2_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: poly_invntt_montgomery +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont * -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a) { - ALIGNED_UINT64(N) tmp; +void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM2_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + 31 * i); - } - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM2_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM2_AVX2_zetas_inv.as_arr + 248); - } + PQCLEAN_DILITHIUM2_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery * * Description: Pointwise multiplication of polynomials in NTT domain * representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. +* by 2^{-32}. 
* * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); +void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_power2round * * Description: For all coefficients c of the input polynomial, -* compute c0, c1 such that c mod Q = c1*2^D + c0 +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 * with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. +* positive standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); - } +void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_decompose * * Description: For all coefficients c of the input polynomial, -* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 -* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* compute high and low bits c1, c0 such that c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we * set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. -* Assumes coefficients to be standard representatives. +* Assumes coefficients to be positive standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_decompose( - poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); - } +void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_make_hint * -* Description: Compute hint polynomial. The coefficients of which indicate -* whether the low bits of the corresponding coefficient of -* the input polynomial overflow into the high bits. +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. * -* Arguments: - poly *h: pointer to output hint polynomial +* Arguments: - uint8_t *h: pointer to output hint array (preallocated of length N) * - const poly *a0: pointer to low part of input polynomial * - const poly *a1: pointer to high part of input polynomial * -* Returns number of 1 bits. +* Returns number of hints, i.e. length of hint array. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM2_AVX2_poly_make_hint( - poly *restrict h, - const poly *restrict a0, - const poly *restrict a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { - h->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]); - s += h->coeffs[i]; - } - return s; +unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(hint, a0->vec, a1->vec); + + DBENCH_STOP(*tround); + return r; } /************************************************* - * Name: PQCLEAN_DILITHIUM2_AVX2_poly_use_hint - * - * Description: Use hint polynomial to correct the high bits of a polynomial. +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_use_hint * -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial * - const poly *h: pointer to input hint polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint( - poly *restrict a, - const poly *restrict b, - const poly *restrict h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM2_AVX2_use_hint(b->coeffs[i], h->coeffs[i]); - } +void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(b->vec, a->vec, h->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_poly_chknorm * * Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. 
+* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM2_AVX2_poly_reduce(). * * Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm is strictly smaller than B and 1 otherwise. +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B) { - int32_t t; +int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + DBENCH_START(); - /* It is ok to leak which coefficient violates the bound since - the probability for each coefficient is independent of secret - data but we must not leak the sign of the centralized representative. */ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (Q - 1) / 2 - a->coeffs[i]; - t ^= (t >> 31); - t = (Q - 1) / 2 - t; - - if ((uint32_t)t >= B) { - return 1; - } + if (B > (Q - 1) / 8) { + return 1; } - return 0; + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + r = 1 - _mm256_testz_si256(t, t); + DBENCH_STOP(*tsample); + return r; } /************************************************* -* Name: rej_uniform_ref +* Name: rej_uniform * * Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_uniform_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos + 3 <= buflen) { @@ -315,101 +373,87 @@ static size_t rej_uniform_ref( } } + DBENCH_STOP(*tsample); return ctr; } /************************************************* -* Name: poly_uniform +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform * * Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t nblocks = POLY_UNIFORM_NBLOCKS; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; - stream128_state state; +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, nblocks, &state); - - ctr = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a->coeffs, N, buf, buflen); + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); while (ctr < N) { - off = buflen % 3; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM128_BLOCKBYTES + off; - stream128_squeezeblocks(buf + off, 1, &state); - ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen); + /* length of buf is always divisible by 3; hence, no bytes left */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); } - stream128_ctx_release(&state); +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); 
} void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t seed[SEEDBYTES], + const uint8_t seed[32], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE128_RATE]; - __m256i state[25]; + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf[4]; + keccakx4_state state; + __m256i f; - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); + buf[0].coeffs[SEEDBYTES + 0] = nonce0; + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; - ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE); - ctr3 = 
PQCLEAN_DILITHIUM2_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_NBLOCKS, &state); + + ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(a3->coeffs, buf[3].coeffs); while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE128_RATE); - ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE128_RATE); - ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE128_RATE); - ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE128_RATE); + ctr0 += rej_uniform(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); } } @@ -417,485 +461,567 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, * Name: rej_eta * * Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_eta_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t0, t1; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos < buflen) { t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - t0; } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; } } + DBENCH_STOP(*tsample); return ctr; } /************************************************* -* Name: poly_uniform_eta +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta * * Description: Sample polynomial with uniformly random coefficients * in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* output stream of SHAKE256(seed|nonce) +* or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta( - poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { stream128_state state; - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); - - while (ctr < N) { - stream128_squeezeblocks(buf, 1, &state); - ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); - } - stream128_ctx_release(&state); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); } -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x( - poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t seed[SEEDBYTES], - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; - 
uint8_t outbuf[4][2 * SHAKE128_RATE]; - __m256i state[25]; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; - - PQCLEAN_DILITHIUM2_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2, - state); - - ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE); - ctr3 = PQCLEAN_DILITHIUM2_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE); - - while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM2_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); - - ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE); - ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE); - ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE); - ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE); - } -} - -/************************************************* -* Name: rej_gamma1m1_ref -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. 
-* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_gamma1m1_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1 -* -* Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES -* - uint16_t nonce: 16-bit nonce -**************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - stream256_state state; - - stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN); - - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream256_ctx_release(&state); -} - -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0, +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t seed[CRHBYTES], + const uint8_t seed[32], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][CRHBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE256_RATE]; - __m256i state[25]; + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_ETA_BUFLEN) buf[4]; - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; - inbuf[0][CRHBYTES + 1] = nonce0 >> 8; - inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF; - inbuf[1][CRHBYTES + 1] = nonce1 >> 8; - 
inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF; - inbuf[2][CRHBYTES + 1] = nonce2 >> 8; - inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF; - inbuf[3][CRHBYTES + 1] = nonce3 >> 8; + __m256i f; + keccakx4_state state; - PQCLEAN_DILITHIUM2_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - CRHBYTES + 2); - PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE); - ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE); - ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE); - ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE); + buf[0].coeffs[SEEDBYTES + 0] = nonce0; + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM2_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_ETA_NBLOCKS, &state); + + ctr0 = PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(a3->coeffs, buf[3].coeffs); while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM2_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); + 
PQCLEAN_DILITHIUM2_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE256_RATE); - ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE256_RATE); - ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE256_RATE); - ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE256_RATE); + ctr0 += rej_eta(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_eta(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_eta(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_eta(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); } } +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM2_AVX2_polyz_unpack reads 14 additional bytes */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(a, buf.coeffs); +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[48], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf[4]; + keccakx4_state state; + __m256i f; + __m128i g; + + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + g = _mm_loadu_si128((__m128i *)&seed[32]); + _mm_store_si128((__m128i *)&buf[0].vec[1], g); + _mm_store_si128((__m128i *)&buf[1].vec[1], g); + _mm_store_si128((__m128i *)&buf[2].vec[1], g); + _mm_store_si128((__m128i *)&buf[3].vec[1], g); + + buf[0].coeffs[CRHBYTES + 0] = nonce0; + buf[0].coeffs[CRHBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[CRHBYTES + 0] = nonce1; + buf[1].coeffs[CRHBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[CRHBYTES + 0] = nonce2; + buf[2].coeffs[CRHBYTES + 1] = nonce2 >> 8; + 
buf[3].coeffs[CRHBYTES + 0] = nonce3; + buf[3].coeffs[CRHBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM2_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, CRHBYTES + 2); + PQCLEAN_DILITHIUM2_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(a0, buf[0].coeffs); + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(a1, buf[1].coeffs); + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(a2, buf[2].coeffs); + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(a3, buf[3].coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2_AVX2_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). +* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); + pos = 8; + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_pack * * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. 
-* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. * * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes +* POLYETA_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *restrict r, const poly *restrict a) { - uint8_t t[N / 2]; - for (size_t i = 0; i < N / 2; ++i) { - t[0] = Q + ETA - a->coeffs[2 * i + 0]; - t[1] = Q + ETA - a->coeffs[2 * i + 1]; - r[i] = t[0] | (t[1] << 4); +void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = ETA - a->coeffs[8 * i + 0]; + t[1] = ETA - a->coeffs[8 * i + 1]; + t[2] = ETA - a->coeffs[8 * i + 2]; + t[3] = ETA - a->coeffs[8 * i + 3]; + t[4] = ETA - a->coeffs[8 * i + 4]; + t[5] = ETA - a->coeffs[8 * i + 5]; + t[6] = ETA - a->coeffs[8 * i + 6]; + t[7] = ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack * * Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[i] & 0x0F; - r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; +void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_pack * -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes +* POLYT1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *restrict r, const poly *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); +void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack * -* Description: Unpack polynomial t1 with 9-bit coefficients. -* Output coefficients are standard representatives. +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] >> 0) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; +void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyt0_pack * * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes +* POLYT0_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[4]; +void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; - r[7 * i + 0] = t[0]; - r[7 * i + 1] = t[0] >> 8; - r[7 * i + 1] |= t[1] << 6; - r[7 * i + 2] = t[1] >> 2; - r[7 * i + 3] = t[1] >> 10; - r[7 * i + 3] |= t[2] << 4; - r[7 * i + 4] = t[2] >> 4; - r[7 * i + 5] = t[2] >> 12; - r[7 * i + 5] |= t[3] << 2; - r[7 * i + 6] = t[3] >> 6; + r[13 * i + 0] = t[0]; + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] |= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * 
i + 12] = t[7] >> 5; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack * * Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; +void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 
* i + 3]; + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyz_pack * -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes +* POLYZ_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[2]; +void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; + for (i = 0; i < N / 4; ++i) { + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; - r[5 * i + 0] = t[0]; - r[5 * i + 1] = t[0] >> 8; - r[5 * i + 2] = t[0] >> 16; - r[5 * i + 2] |= t[1] << 4; - r[5 * i + 3] = t[1] >> 4; - r[5 * i + 4] = t[1] >> 12; + r[9 * i + 0] = t[0]; + r[9 * i + 1] = t[0] >> 8; + r[9 * i + 2] = t[0] >> 16; + r[9 * i + 2] |= t[1] << 2; + r[9 * i + 3] = t[1] >> 6; + r[9 * i + 4] = t[1] >> 14; + r[9 * i + 4] |= t[2] << 4; + r[9 * i + 5] = t[2] >> 4; + r[9 * i + 6] = t[2] >> 12; + r[9 * i + 6] |= t[3] << 6; + r[9 * i + 7] = t[3] >> 2; + r[9 * i + 8] = t[3] >> 10; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyz_unpack * * Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; +void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 14]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1, + -1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); + const __m256i mask = _mm256_set1_epi32(0x3FFFF); + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; - - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + for (i = 0; i < N / 8; i++) { + f = _mm256_loadu_si256((__m256i *)&a[18 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); + _mm256_store_si256(&r->vec[i], f); } + DBENCH_STOP(*tpack); } + /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyw1_pack * -* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. 
+* Input coefficients are assumed to be positive standard representatives. * * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes +* POLYW1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack( - uint8_t *restrict r, - const poly *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); +void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1); + const __m256i shift2 = _mm256_set1_epi32((4096 << 16) + 1); + const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0); + const __m256i shufbidx = _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, + -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + DBENCH_START(); + + for (i = 0; i < N / 32; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift1); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2); + _mm256_storeu_si256((__m256i *)&r[24 * i], f0); } + + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium2/avx2/poly.h b/crypto_sign/dilithium2/avx2/poly.h index d726a55f..8310e4a1 100644 --- a/crypto_sign/dilithium2/avx2/poly.h +++ b/crypto_sign/dilithium2/avx2/poly.h @@ -1,19 +1,14 @@ -#ifndef POLY_H -#define POLY_H - -#include +#ifndef 
PQCLEAN_DILITHIUM2_AVX2_POLY_H +#define PQCLEAN_DILITHIUM2_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" #include -#include "alignment.h" -#include "params.h" - -typedef union { - uint32_t coeffs[N]; - __m256i coeffs_x8[N / 8]; -} poly; +typedef ALIGNED_INT32(N) poly; void PQCLEAN_DILITHIUM2_AVX2_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM2_AVX2_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_caddq(poly *a); void PQCLEAN_DILITHIUM2_AVX2_poly_freeze(poly *a); void PQCLEAN_DILITHIUM2_AVX2_poly_add(poly *c, const poly *a, const poly *b); @@ -21,63 +16,64 @@ void PQCLEAN_DILITHIUM2_AVX2_poly_sub(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(poly *a); void PQCLEAN_DILITHIUM2_AVX2_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM2_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); void PQCLEAN_DILITHIUM2_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); -unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); +unsigned int PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_preinit(poly *a, 
stream128_state *state); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); -int PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform(poly *a, - const uint8_t *seed, - uint16_t nonce); void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(poly *a, - const uint8_t *seed, - uint16_t nonce); void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1(poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(poly *a0, +void PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[CRHBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); -void PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t *a); +void 
PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); -void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); -void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM2_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); + +void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); -void PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(uint8_t *r, const poly *a); #endif diff --git a/crypto_sign/dilithium2/avx2/polyvec.c b/crypto_sign/dilithium2/avx2/polyvec.c index 3603303a..9b989246 100644 --- a/crypto_sign/dilithium2/avx2/polyvec.c +++ b/crypto_sign/dilithium2/avx2/polyvec.c @@ -1,14 +1,103 @@ -#include - +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "polyvec.h" +#include + +#define UNUSED(x) (void)x + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(&mat[0], NULL, rho); + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(&mat[1], NULL, rho); + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(&mat[2], NULL, rho); + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(&mat[3], NULL, rho); +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + UNUSED(rowb); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 0, 1, 2, 3); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[3]); +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + UNUSED(rowb); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 256, 257, 258, 259); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[3]); +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + UNUSED(rowb); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 512, 513, 514, 515); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[2]); + 
PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[3]); +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + UNUSED(rowb); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 768, 769, 770, 771); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM2_AVX2_poly_nttunpack(&rowa->vec[3]); +} + + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]); + } +} + /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze * @@ -18,7 +107,9 @@ * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; 
++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); } } @@ -34,7 +125,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v) { * - const polyvecl *v: pointer to second summand **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -48,44 +141,60 @@ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); } } +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials * in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. Output -* coeffcient are less than 2*L*Q. 
* * Arguments: - poly *w: output polynomial * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, - const polyvecl *u, - const polyvecl *v) { - PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM2_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(). * * Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { - for (size_t i = 0; i < L; ++i) { +int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { return 1; } @@ -98,37 +207,48 @@ int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) /************ Vectors of polynomials of length K **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. +* to representatives in [-6283009,6283007]. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq * * Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. +* add Q if coefficient is negative. 
* * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_csubq(&v->vec[i]); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&v->vec[i]); } } /************************************************* -* Name: polyveck_freeze +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze * * Description: Reduce coefficients of polynomials in vector of length K * to standard representatives. @@ -136,7 +256,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_freeze(&v->vec[i]); } } @@ -152,7 +274,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v) { * - const polyveck *v: pointer to second summand **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -161,8 +285,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_sub * * Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. +* No modular reduction is performed. 
* * Arguments: - polyveck *w: pointer to output vector * - const polyveck *u: pointer to first input vector @@ -170,7 +293,9 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const * subtracted from first input vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -179,12 +304,14 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. +* reduction. Assumes input coefficients to be less than 2^{31-D}. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&v->vec[i]); } } @@ -198,13 +325,15 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. 
Input coefficients need to be less @@ -212,9 +341,19 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v) { * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&v->vec[i]); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); } } @@ -222,16 +361,18 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v) { * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. -* Assumes input coefficients to be standard representatives. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(). * * Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { - for (size_t i = 0; i < K; ++i) { +int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&v->vec[i], bound)) { return 1; } @@ -244,18 +385,20 @@ int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 * with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be * standard representatives. * * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -264,7 +407,7 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co * Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 +* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 * with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we * set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. * Assumes coefficients to be standard representatives. 
@@ -272,12 +415,13 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -287,37 +431,44 @@ void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( * * Description: Compute hint vector. * -* Arguments: - polyveck *h: pointer to output vector +* Arguments: - uint8_t *hint: pointer to output hint array * - const polyveck *v0: pointer to low part of input vector * - const polyveck *v1: pointer to high part of input vector * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; +unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); } - return s; + return n; } /************************************************* -* Name: polyveck_use_hint +* Name: PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint * * Description: Use hint vector to correct the high bits of input vector. 
* * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *v: pointer to input vector +* - const polyveck *u: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); } } diff --git a/crypto_sign/dilithium2/avx2/polyvec.h b/crypto_sign/dilithium2/avx2/polyvec.h index 8191a384..ee8b05ff 100644 --- a/crypto_sign/dilithium2/avx2/polyvec.h +++ b/crypto_sign/dilithium2/avx2/polyvec.h @@ -1,58 +1,72 @@ #ifndef PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H #define PQCLEAN_DILITHIUM2_AVX2_POLYVEC_H - -#include - #include "params.h" #include "poly.h" +#include /* Vectors of polynomials of length L */ typedef struct { poly vec[L]; } polyvecl; +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_reduce(polyvecl *v); + void PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(polyvecl *v); void PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); void PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(polyvecl *v); -void 
PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); - -int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); - +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); +int PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; } polyveck; +void PQCLEAN_DILITHIUM2_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + void PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(polyveck *v); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(polyveck *v); void PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(polyveck *v); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); void PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(polyveck *v); void PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); -int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm( - const polyveck *v, uint32_t B); +int PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round( - polyveck *v1, polyveck *v0, const 
polyveck *v); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const 
polyvecl mat[K], const polyvecl *v); #endif diff --git a/crypto_sign/dilithium2/avx2/reduce.S b/crypto_sign/dilithium2/avx2/reduce.S deleted file mode 100644 index 941828d5..00000000 --- a/crypto_sign/dilithium2/avx2/reduce.S +++ /dev/null @@ -1,93 +0,0 @@ -#include "cdecl.inc" - -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx) -cdecl(PQCLEAN_DILITHIUM2_AVX2_reduce_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8x23ones)(%rip),%ymm0 - -xor %eax,%eax -_looptop_rdc32: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#reduce -vpsrld $23,%ymm1,%ymm2 -vpsrld $23,%ymm3,%ymm4 -vpsrld $23,%ymm5,%ymm6 -vpsrld $23,%ymm7,%ymm8 -vpand %ymm0,%ymm1,%ymm1 -vpand %ymm0,%ymm3,%ymm3 -vpand %ymm0,%ymm5,%ymm5 -vpand %ymm0,%ymm7,%ymm7 -vpsubd %ymm2,%ymm1,%ymm1 -vpsubd %ymm4,%ymm3,%ymm3 -vpsubd %ymm6,%ymm5,%ymm5 -vpsubd %ymm8,%ymm7,%ymm7 -vpslld $13,%ymm2,%ymm2 -vpslld $13,%ymm4,%ymm4 -vpslld $13,%ymm6,%ymm6 -vpslld $13,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd %ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd %ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_rdc32 - -ret - -.global cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx) -cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM2_AVX2_8xq)(%rip),%ymm0 - -xor %eax,%eax -_looptop_csubq: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#cdecl(PQCLEAN_DILITHIUM2_AVX2_csubq) -vpsubd %ymm0,%ymm1,%ymm1 -vpsubd %ymm0,%ymm3,%ymm3 -vpsubd %ymm0,%ymm5,%ymm5 -vpsubd %ymm0,%ymm7,%ymm7 -vpsrad $31,%ymm1,%ymm2 -vpsrad $31,%ymm3,%ymm4 -vpsrad $31,%ymm5,%ymm6 -vpsrad $31,%ymm7,%ymm8 -vpand %ymm0,%ymm2,%ymm2 -vpand %ymm0,%ymm4,%ymm4 -vpand %ymm0,%ymm6,%ymm6 -vpand %ymm0,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd %ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd 
%ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_csubq - -ret diff --git a/crypto_sign/dilithium2/avx2/reduce.h b/crypto_sign/dilithium2/avx2/reduce.h deleted file mode 100644 index 74d4dd2e..00000000 --- a/crypto_sign/dilithium2/avx2/reduce.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef REDUCE_H -#define REDUCE_H - -#include - -void PQCLEAN_DILITHIUM2_AVX2_reduce_avx(uint32_t a[N]); -void PQCLEAN_DILITHIUM2_AVX2_csubq_avx(uint32_t a[N]); - -#endif diff --git a/crypto_sign/dilithium2/avx2/rejsample.c b/crypto_sign/dilithium2/avx2/rejsample.c index d7d85ae1..1c8352e6 100644 --- a/crypto_sign/dilithium2/avx2/rejsample.c +++ b/crypto_sign/dilithium2/avx2/rejsample.c @@ -1,9 +1,10 @@ -#include - #include "params.h" #include "rejsample.h" +#include "symmetric.h" +#include +#include -static const uint8_t idx[256][8] = { +const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8] = { { 0, 0, 0, 0, 0, 0, 0, 0}, { 0, 0, 0, 0, 0, 0, 0, 0}, { 1, 0, 0, 0, 0, 0, 0, 0}, @@ -262,178 +263,144 @@ static const uint8_t idx[256][8] = { { 0, 1, 2, 3, 4, 5, 6, 7} }; -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos, vec[8]; - __m256i d, tmp; +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { + unsigned int ctr, pos; uint32_t good; + __m256i d, tmp; const __m256i bound = _mm256_set1_epi32(Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, + -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, + -1, 5, 4, 3, -1, 2, 1, 0); ctr = pos = 0; - while (ctr + 8 <= len && pos + 24 <= buflen) { - for (size_t i = 0; i < 8; i++) { - vec[i] = buf[pos++]; - vec[i] |= (uint32_t)buf[pos++] << 8; - vec[i] |= (uint32_t)buf[pos++] << 16; - vec[i] &= 0x7FFFFF; - 
} + while (pos <= REJ_UNIFORM_BUFLEN - 24) { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); + d = _mm256_permute4x64_epi64(d, 0x94); + d = _mm256_shuffle_epi8(d, idx8); + d = _mm256_and_si256(d, mask); + pos += 24; - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); + tmp = _mm256_sub_epi32(d, bound); good = _mm256_movemask_ps((__m256)tmp); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); + tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good])); d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); + + _mm256_storeu_si256((__m256i *)&r[ctr], d); + ctr += _mm_popcnt_u32(good); + + if (ctr > N - 8) { + break; + } } - while (ctr < len && pos + 3 <= buflen) { - vec[0] = buf[pos++]; - vec[0] |= (uint32_t)buf[pos++] << 8; - vec[0] |= (uint32_t)buf[pos++] << 16; - vec[0] &= 0x7FFFFF; + uint32_t t; + while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; - if (vec[0] < Q) { - r[ctr++] = vec[0]; + if (t < Q) { + r[ctr++] = t; } } return ctr; } -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos; - uint8_t vec[32]; - __m256i tmp0, tmp1; - __m128i d0, d1, rid; +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { + unsigned int ctr, pos; uint32_t good; - const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); - const __m256i off = _mm256_set1_epi32(Q + ETA); + __m256i f0, f1, f2; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(ETA); + const __m256i bound = mask; + const __m256i v = _mm256_set1_epi32(-6560); + const __m256i p = _mm256_set1_epi32(5); ctr = pos = 0; - while (ctr + 32 <= len && pos + 16 <= 
buflen) { - for (size_t i = 0; i < 16; i++) { - vec[2 * i + 0] = buf[pos] & 0x0F; - vec[2 * i + 1] = buf[pos++] >> 4; + while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = _mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - tmp0 = _mm256_loadu_si256((__m256i_u *)vec); - tmp1 = _mm256_cmpgt_epi8(bound, tmp0); - good = _mm256_movemask_epi8(tmp1); + if (ctr > N - 8) { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - d0 = _mm256_castsi256_si128(tmp0); - rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - 
tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount(good & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 8) & 0xFF); - - d0 = _mm256_extracti128_si256(tmp0, 1); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 16) & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 24) & 0xFF); + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2_AVX2_idxlut[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good); + pos += 4; } - while (ctr < len && pos < buflen) { - vec[0] = buf[pos] & 0x0F; - vec[1] = buf[pos++] >> 4; + uint32_t t0, t1; + while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; - if (vec[0] <= 2 * ETA) { - r[ctr++] = Q + ETA - vec[0]; + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + r[ctr++] = 2 - t0; } - if (vec[1] <= 2 * ETA && ctr < len) { - r[ctr++] = Q + ETA - vec[1]; - } - } - - return ctr; -} - -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, 
- size_t buflen) { - uint32_t ctr, pos; - uint32_t vec[8]; - __m256i d, tmp; - uint32_t good; - const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); - const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); - - ctr = pos = 0; - while (ctr + 8 <= len && pos + 20 <= buflen) { - for (size_t i = 0; i < 4; i++) { - vec[2 * i + 0] = buf[pos + 0]; - vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; - vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; - vec[2 * i + 0] &= 0xFFFFF; - - vec[2 * i + 1] = buf[pos + 2] >> 4; - vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4; - vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - } - - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); - good = _mm256_movemask_ps((__m256)tmp); - d = _mm256_sub_epi32(off, d); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); - d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); - } - - while (ctr < len && pos + 5 <= buflen) { - vec[0] = buf[pos + 0]; - vec[0] |= (uint32_t)buf[pos + 1] << 8; - vec[0] |= (uint32_t)buf[pos + 2] << 16; - vec[0] &= 0xFFFFF; - - vec[1] = buf[pos + 2] >> 4; - vec[1] |= (uint32_t)buf[pos + 3] << 4; - vec[1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (vec[0] <= 2 * GAMMA1 - 2) { - r[ctr++] = Q + GAMMA1 - 1 - vec[0]; - } - if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { - r[ctr++] = Q + GAMMA1 - 1 - vec[1]; + if (t1 < 15 && ctr < N) { + t1 = t1 - (205 * t1 >> 10) * 5; + r[ctr++] = 2 - t1; } } diff --git a/crypto_sign/dilithium2/avx2/rejsample.h b/crypto_sign/dilithium2/avx2/rejsample.h index 31b7fae0..f7f3cbb3 100644 --- a/crypto_sign/dilithium2/avx2/rejsample.h +++ b/crypto_sign/dilithium2/avx2/rejsample.h @@ -1,25 +1,19 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H - -#include +#ifndef PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H +#define PQCLEAN_DILITHIUM2_AVX2_REJSAMPLE_H +#include "params.h" +#include "symmetric.h" 
#include -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_eta( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -uint32_t PQCLEAN_DILITHIUM2_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +extern const uint8_t PQCLEAN_DILITHIUM2_AVX2_idxlut[256][8]; + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); + +unsigned int PQCLEAN_DILITHIUM2_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN]); #endif diff --git a/crypto_sign/dilithium2/avx2/rounding.c b/crypto_sign/dilithium2/avx2/rounding.c index ae8f6f5b..6a13031c 100644 --- a/crypto_sign/dilithium2/avx2/rounding.c +++ b/crypto_sign/dilithium2/avx2/rounding.c @@ -1,115 +1,157 @@ +#include "consts.h" +#include "params.h" +#include "rejsample.h" #include "rounding.h" +#include +#include +#include + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) /************************************************* * Name: power2round * -* Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. -* Assumes a to be standard representative. +* Description: For finite field elements a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be positive standard representative. 
* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - __m256i *a1: output array of length N/8 with high bits +* - __m256i *a0: output array of length N/8 with low bits a0 +* - const __m256i *a: input array of length N/8 * -* Returns a1. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0) { - int32_t t; +void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i mask = _mm256_set1_epi32(-(1 << D)); + const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += (t >> 31) & (1U << D); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; + for (i = 0; i < N / 8; ++i) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, half); + f0 = _mm256_and_si256(f1, mask); + f1 = _mm256_srli_epi32(f1, D); + f0 = _mm256_sub_epi32(f, f0); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_decompose +* Name: decompose * -* Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* Description: For finite field element a, compute high and low parts a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except * if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard * representative. 
* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - __m256i *a1: output array of length N/8 with high parts +* - __m256i *a0: output array of length N/8 with low parts a0 +* - const __m256i *a: input array of length N/8 * -* Returns a1. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; +void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1, t; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2_AVX2_qdata.vec[_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(11275); + const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(128); + const __m256i max = _mm256_set1_epi32(43); + const __m256i zero = _mm256_setzero_si256(); - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (a >> 19) << 9; - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= t; - - /* Divide by ALPHA (possible to avoid) */ - u = a - 1; - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - - /* Border case */ - *a0 = Q + t - (a >> 4); - a &= 0xF; - return a; + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + t = _mm256_sub_epi32(max, f1); + f1 = _mm256_blendv_epi32(f1, zero, t); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_make_hint +* Name: make_hint * -* Description: Compute hint bit 
indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. +* Description: Compute indices of polynomial coefficients whose low bits +* overflow into the high bits. * -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element +* Arguments: - uint8_t *hint: hint array +* - const __m256i *a0: low bits of input elements +* - const __m256i *a1: high bits of input elements * -* Returns 1 if high bits of a and b differ and 0 otherwise. +* Returns number of overflowing low bits **************************************************/ -unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; +unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { + unsigned int i, n = 0; + __m256i f0, f1, g0, g1; + uint32_t bad; + uint64_t idx; + const __m256i low = _mm256_set1_epi32(-GAMMA2); + const __m256i high = _mm256_set1_epi32(GAMMA2); + + for (i = 0; i < N / 8; ++i) { + f0 = _mm256_load_si256(&a0[i]); + f1 = _mm256_load_si256(&a1[i]); + g0 = _mm256_abs_epi32(f0); + g0 = _mm256_cmpgt_epi32(g0, high); + g1 = _mm256_cmpeq_epi32(f0, low); + g1 = _mm256_sign_epi32(g1, f1); + g0 = _mm256_or_si256(g0, g1); + + bad = _mm256_movemask_ps((__m256)g0); + memcpy(&idx, PQCLEAN_DILITHIUM2_AVX2_idxlut[bad], 8); + idx += (uint64_t)0x0808080808080808 * i; + memcpy(&hint[n], &idx, 8); + n += _mm_popcnt_u32(bad); } - return 1; + return n; } /************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_use_hint +* Name: use_hint * -* Description: Correct high bits according to hint. +* Description: Correct high parts according to hint. 
* -* Arguments: - uint32_t a: input element -* - unsigned int hint: hint bit +* Arguments: - __m256i *b: output array of length N/8 with corrected high parts +* - const __m256i *a: input array of length N/8 +* - const __m256i *a: input array of length N/8 with hint bits * -* Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(const uint32_t a, const unsigned int hint) { - uint32_t a0, a1; +void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { + unsigned int i; + __m256i a0[N / 8]; + __m256i f, g, h, t; + const __m256i zero = _mm256_setzero_si256(); + const __m256i max = _mm256_set1_epi32(43); - a1 = PQCLEAN_DILITHIUM2_AVX2_decompose(a, &a0); - if (hint == 0) { - return a1; + PQCLEAN_DILITHIUM2_AVX2_decompose_avx(b, a0, a); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a0[i]); + g = _mm256_load_si256(&b[i]); + h = _mm256_load_si256(&hint[i]); + t = _mm256_blendv_epi32(zero, h, f); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + g = _mm256_add_epi32(g, h); + g = _mm256_blendv_epi32(g, max, g); + f = _mm256_cmpgt_epi32(g, max); + g = _mm256_blendv_epi32(g, zero, f); + _mm256_store_si256(&b[i], g); } - if (a0 > Q) { - return (a1 + 1) & 0xF; - } - return (a1 - 1) & 0xF; - - /* If decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ } diff --git a/crypto_sign/dilithium2/avx2/rounding.h b/crypto_sign/dilithium2/avx2/rounding.h index 3a30aa15..6d200335 100644 --- a/crypto_sign/dilithium2/avx2/rounding.h +++ b/crypto_sign/dilithium2/avx2/rounding.h @@ -1,12 +1,12 @@ -#ifndef ROUNDING_H -#define ROUNDING_H - +#ifndef PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H +#define PQCLEAN_DILITHIUM2_AVX2_ROUNDING_H #include "params.h" +#include #include -uint32_t PQCLEAN_DILITHIUM2_AVX2_power2round(uint32_t a, uint32_t *a0); -uint32_t 
PQCLEAN_DILITHIUM2_AVX2_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM2_AVX2_use_hint(uint32_t a, unsigned int hint); +void PQCLEAN_DILITHIUM2_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); +void PQCLEAN_DILITHIUM2_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); +unsigned int PQCLEAN_DILITHIUM2_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); +void PQCLEAN_DILITHIUM2_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); #endif diff --git a/crypto_sign/dilithium2/avx2/shuffle.S b/crypto_sign/dilithium2/avx2/shuffle.S new file mode 100644 index 00000000..fd8eece7 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/shuffle.S @@ -0,0 +1,54 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM2_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium2/avx2/shuffle.inc b/crypto_sign/dilithium2/avx2/shuffle.inc index 
df352030..73e9ffe0 100644 --- a/crypto_sign/dilithium2/avx2/shuffle.inc +++ b/crypto_sign/dilithium2/avx2/shuffle.inc @@ -9,15 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_sign/dilithium2/avx2/sign.c b/crypto_sign/dilithium2/avx2/sign.c index 3b7b8c75..5dcf7261 100644 --- a/crypto_sign/dilithium2/avx2/sign.c +++ b/crypto_sign/dilithium2/avx2/sign.c @@ -1,6 +1,4 @@ -#include -#include - +#include "align.h" #include "fips202.h" #include "packing.h" #include "params.h" @@ -9,93 +7,28 @@ #include "randombytes.h" #include "sign.h" #include "symmetric.h" +#include +#include -/************************************************* -* Name: expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). 
-* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[4], const uint8_t rho[SEEDBYTES]) { - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[0].vec[0], - &mat[0].vec[1], - &mat[0].vec[2], - &mat[1].vec[0], - rho, 0, 1, 2, 256); - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[1].vec[1], - &mat[1].vec[2], - &mat[2].vec[0], - &mat[2].vec[1], - rho, 257, 258, 512, 513); - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_4x(&mat[2].vec[2], - &mat[3].vec[0], - &mat[3].vec[1], - &mat[3].vec[2], - rho, 514, 768, 769, 770); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM2_AVX2_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). -* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint8_t b; - size_t pos; - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; +static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) { + switch (i) { + case 0: + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row0(buf, buf + 1, rho); + *row = buf; + break; + case 1: + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row1(buf + 1, buf, rho); + *row = buf + 1; + break; + case 2: + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row2(buf, buf + 1, rho); + *row = buf; + break; + case 3: + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand_row3(buf + 1, buf, rho); + *row = buf + 1; 
+ break; } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t) outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - shake256_ctx_release(&state); } /************************************************* @@ -104,56 +37,69 @@ void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, * Description: Generates public and private key. * * Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES bytes) * - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM2_AVX2_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; uint8_t seedbuf[3 * SEEDBYTES]; - uint8_t tr[CRHBYTES]; const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; - polyvecl mat[K]; - polyvecl s1, s1hat; - polyveck s2, t, t1, t0; + polyvecl rowbuf[2]; + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); rho = seedbuf; 
rhoprime = seedbuf + SEEDBYTES; key = seedbuf + 2 * SEEDBYTES; - /* Expand matrix */ - PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); /* Sample short vectors s1 and s2 */ - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s2.vec[0], rhoprime, - nonce, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &t.vec[0], rhoprime, - nonce + 4, nonce + 5, nonce + 6, 0); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, 4, 5, 6, 7); - /* Matrix-vector multiplication */ - s1hat = s1; - PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - //PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&t.vec[i]); + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM2_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); } - /* Add error vector s2 */ - PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&t, &t, &s2); + /* Transform s1 */ + PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); - /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&t); - PQCLEAN_DILITHIUM2_AVX2_polyveck_power2round(&t1, &t0, &t); - PQCLEAN_DILITHIUM2_AVX2_pack_pk(pk, rho, &t1); - /* Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM2_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + for 
(i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, rho, i); + + /* Compute inner-product */ + PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM2_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM2_AVX2_poly_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM2_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM2_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); return 0; } @@ -161,42 +107,40 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature * -* Description: Compute signed message. +* Description: Computes signature. 
* -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES -* of len) -* - size_t *siglen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - uint32_t n; +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; + shake256incctx state; rho = seedbuf; tr = rho + SEEDBYTES; key = tr + CRHBYTES; mu = key + SEEDBYTES; rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + PQCLEAN_DILITHIUM2_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); - - // use incremental hash API instead of copying around buffers - /* Compute CRH(tr, m) */ - shake256incctx state; + /* Compute CRH(tr, msg) */ shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); shake256_inc_absorb(&state, m, 
mlen); @@ -207,76 +151,88 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature( crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&s1); PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&s2); PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t0); + rej: /* Sample intermediate vector y */ - PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &yhat.vec[0], - rhoprime, nonce, nonce + 1, nonce + 2, 0); - nonce += 3; + PQCLEAN_DILITHIUM2_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], + rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); + nonce += 4; - /* Matrix-vector multiplication */ - yhat = y; - PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&w.vec[i]); - } + /* Matrix-vector product */ + tmpv.y = z; + PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM2_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_tomont(&w1); /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w); - PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM2_AVX2_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); + PQCLEAN_DILITHIUM2_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM2_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); + PQCLEAN_DILITHIUM2_AVX2_polyveck_pack_w1(sig, &w1); - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - 
PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&w0, &w0, &cs2); - PQCLEAN_DILITHIUM2_AVX2_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&z.vec[i]); - } - PQCLEAN_DILITHIUM2_AVX2_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM2_AVX2_polyvecl_freeze(&z); - if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - goto rej; + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&z.vec[i]); + if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { + goto rej; + } } - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM2_AVX2_poly_invntt_montgomery(&ct0.vec[i]); + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], 
&tmp); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&tmp); + if (PQCLEAN_DILITHIUM2_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM2_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM2_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; } - PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM2_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { - goto rej; + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); } - PQCLEAN_DILITHIUM2_AVX2_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&w0); - n = PQCLEAN_DILITHIUM2_AVX2_polyveck_make_hint(&h, &w0, &w1); - if (n > OMEGA) { - goto rej; - } - - /* Write signature */ - PQCLEAN_DILITHIUM2_AVX2_pack_sig(sig, &z, &h, &c); *siglen = PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; return 0; } @@ -290,63 +246,55 @@ rej: * array with PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen bytes), * can be equal to m * - size_t *smlen: pointer to output length of signed -* message +* message * - const uint8_t *m: pointer to message to be signed * - size_t mlen: length of message * - const uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, m, mlen); - rc = 
PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, mlen, sk); *smlen += mlen; - return rc; + return 0; } /************************************************* * Name: PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify * -* Description: Verify signed message. +* Description: Verifies signature. * -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key * -* Returns 0 if signed message could be verified correctly and -1 otherwise +* Returns 0 if signature could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, - const uint8_t *pk) { - uint8_t rho[SEEDBYTES]; +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { + unsigned int i, j, pos = 0; + /* PQCLEAN_DILITHIUM2_AVX2_polyw1_pack writes additional 14 bytes */ + ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; uint8_t mu[CRHBYTES]; - poly c, chat, cp; - polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; + const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + polyvecl rowbuf[2]; + polyvecl
*row = rowbuf; + polyvecl z; + poly c, w1, h; + shake256incctx state; - if (siglen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { - return -1; - } - - PQCLEAN_DILITHIUM2_AVX2_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM2_AVX2_unpack_sig(&z, &h, &c, sig)) { - return -1; - } - if (PQCLEAN_DILITHIUM2_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + if (siglen != PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { return -1; } /* Compute CRH(CRH(rho, t1), msg) */ crh(mu, pk, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -354,33 +302,69 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM2_AVX2_expand_mat(mat, rho); - PQCLEAN_DILITHIUM2_AVX2_polyvecl_ntt(&z); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); + /* Expand PQCLEAN_DILITHIUM2_AVX2_challenge */ + PQCLEAN_DILITHIUM2_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&c); + + /* Unpack z; shortness follows from unpacking */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&z.vec[i]); } - chat = c; - PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&chat); - PQCLEAN_DILITHIUM2_AVX2_polyveck_shiftl(&t1); - PQCLEAN_DILITHIUM2_AVX2_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, pk, i); + + /* Compute i-th row of Az - c2^Dt1 */ + PQCLEAN_DILITHIUM2_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); + + PQCLEAN_DILITHIUM2_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * 
POLYT1_PACKEDBYTES); + PQCLEAN_DILITHIUM2_AVX2_poly_shiftl(&h); + PQCLEAN_DILITHIUM2_AVX2_poly_ntt(&h); + PQCLEAN_DILITHIUM2_AVX2_poly_pointwise_montgomery(&h, &c, &h); + + PQCLEAN_DILITHIUM2_AVX2_poly_sub(&w1, &w1, &h); + PQCLEAN_DILITHIUM2_AVX2_poly_reduce(&w1); + PQCLEAN_DILITHIUM2_AVX2_poly_invntt_tomont(&w1); + + /* Get hint polynomial and reconstruct w1 */ + memset(h.vec, 0, sizeof(poly)); + if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { + return -1; + } + + for (j = pos; j < hint[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > pos && hint[j] <= hint[j - 1]) { + return -1; + } + h.coeffs[hint[j]] = 1; + } + pos = hint[OMEGA + i]; + + PQCLEAN_DILITHIUM2_AVX2_poly_caddq(&w1); + PQCLEAN_DILITHIUM2_AVX2_poly_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM2_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); } - PQCLEAN_DILITHIUM2_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); - PQCLEAN_DILITHIUM2_AVX2_polyveck_reduce(&tmp1); - PQCLEAN_DILITHIUM2_AVX2_polyveck_invntt_montgomery(&tmp1); + /* Extra indices are zero for strong unforgeability */ + for (j = pos; j < OMEGA; ++j) { + if (hint[j]) { + return -1; + } + } - /* Reconstruct w1 */ - PQCLEAN_DILITHIUM2_AVX2_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM2_AVX2_polyveck_use_hint(&w1, &tmp1, &h); - - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM2_AVX2_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { + /* Call random oracle and verify PQCLEAN_DILITHIUM2_AVX2_challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (buf.coeffs[i] != sig[i]) { return -1; } } @@ -394,7 +378,7 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( * Description: Verify 
signed message. * * Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm +* array with smlen bytes), can be equal to sm * - size_t *mlen: pointer to output length of message * - const uint8_t *sm: pointer to signed message * - size_t smlen: length of signed message @@ -402,30 +386,28 @@ int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify( * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { + for (i = 0; i < *mlen; ++i) { m[i] = sm[PQCLEAN_DILITHIUM2_AVX2_CRYPTO_BYTES + i]; } return 0; } - /* Signature verification failed */ badsig: - *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { + /* Signature verification failed */ + *mlen = -1; + for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/crypto_sign/dilithium2/avx2/sign.h b/crypto_sign/dilithium2/avx2/sign.h index a8e5c368..94dc98b1 100644 --- a/crypto_sign/dilithium2/avx2/sign.h +++ b/crypto_sign/dilithium2/avx2/sign.h @@ -1,15 +1,29 @@ -#ifndef SIGN_H -#define SIGN_H - -#include "api.h" +#ifndef PQCLEAN_DILITHIUM2_AVX2_SIGN_H +#define 
PQCLEAN_DILITHIUM2_AVX2_SIGN_H #include "params.h" #include "poly.h" #include "polyvec.h" +#include +#include -void PQCLEAN_DILITHIUM2_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); +void PQCLEAN_DILITHIUM2_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM2_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); #endif - diff --git a/crypto_sign/dilithium2/avx2/stream.c b/crypto_sign/dilithium2/avx2/stream.c deleted file mode 100644 index 98e7a6d4..00000000 --- a/crypto_sign/dilithium2/avx2/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium2/avx2/stream.h b/crypto_sign/dilithium2/avx2/stream.h deleted file mode 100644 
index 9185af8c..00000000 --- a/crypto_sign/dilithium2/avx2/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM2_AVX2_STREAM_H -#define PQCLEAN_DILITHIUM2_AVX2_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium2/avx2/symmetric-shake.c b/crypto_sign/dilithium2/avx2/symmetric-shake.c new file mode 100644 index 00000000..803b8121 --- /dev/null +++ b/crypto_sign/dilithium2/avx2/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium2/avx2/symmetric.h b/crypto_sign/dilithium2/avx2/symmetric.h index 33d32654..deb8cf6c 100644 --- a/crypto_sign/dilithium2/avx2/symmetric.h +++ b/crypto_sign/dilithium2/avx2/symmetric.h @@ -1,25 +1,36 @@ #ifndef PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H #define PQCLEAN_DILITHIUM2_AVX2_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - #include "fips202.h" +#include "params.h" +#include -#define crh(OUT, IN, INBYTES) shake256(OUT, 
CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_AVX2_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) #endif diff --git a/crypto_sign/dilithium2/clean/LICENSE b/crypto_sign/dilithium2/clean/LICENSE index 40541676..08473af7 100644 --- a/crypto_sign/dilithium2/clean/LICENSE +++ 
b/crypto_sign/dilithium2/clean/LICENSE @@ -1,6 +1,5 @@ Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. diff --git a/crypto_sign/dilithium2/clean/Makefile b/crypto_sign/dilithium2/clean/Makefile index f9448299..6c1aea72 100644 --- a/crypto_sign/dilithium2/clean/Makefile +++ b/crypto_sign/dilithium2/clean/Makefile @@ -1,13 +1,10 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium2_clean.a +HEADERS=api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o -HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h stream.h - -CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) diff --git a/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake index 5f22b2c1..410bd6ac 100644 --- a/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake +++ b/crypto_sign/dilithium2/clean/Makefile.Microsoft_nmake @@ -2,8 +2,13 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libdilithium2_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj -CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX +OBJECTS=ntt.obj 
packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 all: $(LIBRARY) @@ -11,7 +16,7 @@ all: $(LIBRARY) $(OBJECTS): *.h $(LIBRARY): $(OBJECTS) - LIB.EXE /NOLOGO /WX /OUT:$@ $** + LIB.EXE /NOLOGO /WX /OUT:$@ $** clean: -DEL $(OBJECTS) diff --git a/crypto_sign/dilithium2/clean/api.h b/crypto_sign/dilithium2/clean/api.h index 1053371a..dabeeae8 100644 --- a/crypto_sign/dilithium2/clean/api.h +++ b/crypto_sign/dilithium2/clean/api.h @@ -4,26 +4,13 @@ #include #include - -#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1184U -#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2800U -#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2044U - +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES 2544 +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES 2420 #define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2" -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -33,6 +20,12 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t 
*m, size_t mlen, const uint8_t *sk); +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium2/clean/ntt.c b/crypto_sign/dilithium2/clean/ntt.c index a684c3fd..af3c776b 100644 --- a/crypto_sign/dilithium2/clean/ntt.c +++ b/crypto_sign/dilithium2/clean/ntt.c @@ -1,138 +1,98 @@ +#include "ntt.h" +#include "params.h" +#include "reduce.h" #include -#include "params.h" -#include "ntt.h" -#include "poly.h" -#include "reduce.h" - -/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM2_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas[N] = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, - 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, - 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, - 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, 2706023, - 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, 4519302, - 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, 6681150, - 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, 811944, - 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, 4450022, - 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, 7122806, - 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, 3412210, - 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, 7709315, - 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, 5037034, - 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, 4656075, 8371839, - 1653064, 5130689, 2389356, 8169440, 759969, 7063561, 189548, 4827145, - 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, 1285669, 6795489, - 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, 2091667, 3407706, - 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, 266997, 2434439, - 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, 900702, 
1859098, - 909542, 819034, 495491, 6767243, 8337157, 7857917, 7725090, 5257975, - 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, 342297, 286988, - 5942594, 4108315, 3437287, 5038140, 1735879, 203044, 2842341, 2691481, - 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, 4613401, 1250494, - 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, 7047359, 1237275, - 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, 7100756, 1917081, - 5834105, 7005614, 1500165, 777191, 2235880, 3406031, 7838005, 5548557, - 6709241, 6533464, 5796124, 4656147, 594136, 4603424, 6366809, 2432395, - 2454455, 8215696, 1957272, 3369112, 185531, 7173032, 5196991, 162844, - 1616392, 3014001, 810149, 1652634, 4686184, 6581310, 5341501, 3523897, - 3866901, 269760, 2213111, 7404533, 1717735, 472078, 7953734, 1723600, - 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, 5441381, 6144432, - 7959518, 6094090, 183443, 7403526, 1612842, 4834730, 7826001, 3919660, - 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 -}; - -/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM2_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[N] = { - 6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, - 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, - 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, - 7908339, 6662682, 975884, 6167306, 8110657, 4513516, 4856520, 3038916, - 1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426, - 1207385, 8194886, 5011305, 6423145, 164721, 5925962, 5948022, 2013608, - 3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412, - 4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661, - 1962642, 5074302, 7067962, 451100, 1430225, 3318210, 7143142, 1333058, - 1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016, - 6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076, - 8177373, 6644538, 3342277, 
4943130, 4272102, 2437823, 8093429, 8038120, - 3595838, 768622, 525098, 3556995, 5173371, 6348669, 3122442, 655327, - 522500, 43260, 1613174, 7884926, 7561383, 7470875, 6521319, 7479715, - 3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420, - 3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750, - 4540456, 3628969, 3881060, 3019102, 1439742, 812732, 1584928, 7094748, - 7039087, 7064828, 177440, 2409325, 1851402, 5220671, 3553272, 8190869, - 1316856, 7620448, 210977, 5991061, 3249728, 6727353, 8578, 3724342, - 4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383, - 1430430, 6527646, 7031341, 381987, 1308169, 22981, 1228525, 671102, - 2477047, 411027, 3693493, 2967645, 5665122, 6232521, 983419, 4968207, - 8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611, - 1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395, - 2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473, - 4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267, - 539299, 6031717, 300467, 4840449, 2867647, 4805995, 3043716, 3861115, - 4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394, - 8100412, 4369920, 19422, 6623180, 3277672, 1399561, 3859737, 2118186, - 2108549, 5760665, 1119584, 549488, 4794489, 1079900, 7356305, 5654953, - 5700314, 5268920, 2884855, 5260684, 2091905, 359251, 6026966, 6554070, - 7913949, 876248, 777960, 8143293, 518909, 2608894, 8354570 +static const int32_t zetas[N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, + 2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, + -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, 
-3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, + -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, + 3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, + -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, + -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, + -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, + 1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, + 2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, + 266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, + 900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, + -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, + 342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, + -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, + -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, + -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, + -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, + -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, + -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, + -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, + -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, + -554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 }; /************************************************* 
* Name: PQCLEAN_DILITHIUM2_CLEAN_ntt * * Description: Forward NTT, in-place. No modular reduction is performed after -* additions or subtractions. Hence output coefficients can be up -* to 16*Q larger than the coefficients of the input polynomial. -* Output vector is in bitreversed order. +* additions or subtractions. Output vector is in bitreversed order. * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]) { - size_t k, j; - uint32_t zeta, t; +void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]) { + unsigned int len, start, j, k; + int32_t zeta, t; - k = 1; - for (size_t len = 128; len > 0; len >>= 1) { - for (size_t start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas[k++]; + k = 0; + for (len = 128; len > 0; len >>= 1) { + for (start = 0; start < N; start = j + len) { + zeta = zetas[++k]; for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); - p[j + len] = p[j] + 2 * Q - t; - p[j] = p[j] + t; + t = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + a[j + len] = a[j] - t; + a[j] = a[j] + t; } } } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont +* Name: PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont * * Description: Inverse NTT and multiplication by Montgomery factor 2^32. * In-place. No modular reductions after additions or -* subtractions. Input coefficient need to be smaller than 2*Q. -* Output coefficient are smaller than 2*Q. +* subtractions; input coefficients need to be smaller than +* Q in absolute value. Output coefficients are smaller than Q in +* absolute value. 
* * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]) { - size_t start, len, j, k; - uint32_t t, zeta; - const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; +void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]) { + unsigned int start, len, j, k; + int32_t t, zeta; + const int32_t f = 41978; // mont^2/256 - k = 0; + k = 256; for (len = 1; len < N; len <<= 1) { for (start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM2_CLEAN_zetas_inv[k++]; + zeta = -zetas[--k]; for (j = start; j < start + len; ++j) { - t = p[j]; - p[j] = t + p[j + len]; - p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); + t = a[j]; + a[j] = t + a[j + len]; + a[j + len] = t - a[j + len]; + a[j + len] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); } } } for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t) f * p[j]); + a[j] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)f * a[j]); } } diff --git a/crypto_sign/dilithium2/clean/ntt.h b/crypto_sign/dilithium2/clean/ntt.h index b02c9dab..3b0ff001 100644 --- a/crypto_sign/dilithium2/clean/ntt.h +++ b/crypto_sign/dilithium2/clean/ntt.h @@ -1,11 +1,10 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_NTT_H #define PQCLEAN_DILITHIUM2_CLEAN_NTT_H - +#include "params.h" #include -#include "params.h" +void PQCLEAN_DILITHIUM2_CLEAN_ntt(int32_t a[N]); -void PQCLEAN_DILITHIUM2_CLEAN_ntt(uint32_t p[N]); -void PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(uint32_t p[N]); +void PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(int32_t a[N]); #endif diff --git a/crypto_sign/dilithium2/clean/packing.c b/crypto_sign/dilithium2/clean/packing.c index d91fe1ab..b54c9646 100644 --- a/crypto_sign/dilithium2/clean/packing.c +++ b/crypto_sign/dilithium2/clean/packing.c @@ -3,6 +3,7 
@@ #include "poly.h" #include "polyvec.h" + /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_pack_pk * @@ -12,17 +13,18 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { pk[i] = rho[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); } } @@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( * - const polyveck *t1: pointer to output vector t1 * - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = pk[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sk * -* Description: Bit-pack 
secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). * * Arguments: - uint8_t sk[]: output byte array * - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key * - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = rho[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = key[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { sk[i] = tr[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, 
&s2->vec[i]); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sk * -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). * * Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key * - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 * - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { key[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { tr[i] = sk[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * 
POLETA_SIZE_PACKED); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_pack_sig * -* Description: Bit-pack signature sig = (z, h, c). +* Description: Bit-pack signature sig = (c, z, h). * * Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2_CLEAN_challenge hash length SEEDBYTES * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; +void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + 
sig += L * POLYZ_PACKEDBYTES; /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { + for (i = 0; i < OMEGA + K; ++i) { sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; } } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); + + sig[OMEGA + i] = (uint8_t) k; } } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_unpack_sig * -* Description: Unpack signature sig = (z, h, c). +* Description: Unpack signature sig = (c, z, h). * -* Arguments: - polyvecl *z: pointer to output vector z +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM2_CLEAN_challenge hash +* - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial * - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. 
**************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; +int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; /* Decode h */ k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { h->vec[i].coeffs[j] = 0; } @@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( return 1; } - for (size_t j = k; j < sig[OMEGA + i]; ++j) { + for (j = k; j < sig[OMEGA + i]; ++j) { /* Coefficients are ordered for strong unforgeability */ if (j > k && sig[j] <= sig[j - 1]) { return 1; @@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( } /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { + for (j = k; j < OMEGA; ++j) { if (sig[j]) { return 1; } } - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - return 0; } diff --git 
a/crypto_sign/dilithium2/clean/packing.h b/crypto_sign/dilithium2/clean/packing.h index 16377f9c..44a088fd 100644 --- a/crypto_sign/dilithium2/clean/packing.h +++ b/crypto_sign/dilithium2/clean/packing.h @@ -1,42 +1,31 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_PACKING_H #define PQCLEAN_DILITHIUM2_CLEAN_PACKING_H - -#include "api.h" #include "params.h" #include "polyvec.h" +#include -void PQCLEAN_DILITHIUM2_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1); -void PQCLEAN_DILITHIUM2_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM2_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM2_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM2_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM2_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, 
const polyveck *h); + +void PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium2/clean/params.h b/crypto_sign/dilithium2/clean/params.h index 5d31774d..93aa7897 100644 --- a/crypto_sign/dilithium2/clean/params.h +++ b/crypto_sign/dilithium2/clean/params.h @@ -2,28 +2,40 @@ #define PQCLEAN_DILITHIUM2_CLEAN_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) +#define D 13 +#define ROOT_OF_UNITY 1753 #define K 4 -#define L 3 -#define ETA 6 -#define SETABITS 4 -#define BETA 325 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((Q-1)/88) #define OMEGA 80 +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_ALGNAME "Dilithium2" -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 576 + +#define POLYW1_PACKEDBYTES 192 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + 
K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium2/clean/poly.c b/crypto_sign/dilithium2/clean/poly.c index 733efb17..f46027a5 100644 --- a/crypto_sign/dilithium2/clean/poly.c +++ b/crypto_sign/dilithium2/clean/poly.c @@ -4,48 +4,66 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" +#include +#define DBENCH_START() +#define DBENCH_STOP(t) /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_reduce * -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_csubq +* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_caddq * -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_csubq(a->coeffs[i]); +void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_caddq(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_freeze * -* Description: Reduce all coefficients of the polynomial to standard -* representatives. +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_freeze(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* @@ -57,85 +75,111 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a) { * - const poly *a: pointer to first summand * - const poly *b: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_sub * -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is +* Description: Subtract polynomials. 
No modular reduction is * performed. * * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial to be -* subtracted from first input polynomial +* subtracted from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl * * Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. +* input coefficients to be less than 2^{31-D} in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] <<= D; } + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_ntt * -* Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + PQCLEAN_DILITHIUM2_CLEAN_ntt(a->coeffs); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont * -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a) { - PQCLEAN_DILITHIUM2_CLEAN_invntt_frominvmont(a->coeffs); +void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2_CLEAN_invntt_tomont(a->coeffs); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery * * Description: Pointwise multiplication of polynomials in NTT domain * representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. +* by 2^{-32}. 
* * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); +void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); } + DBENCH_STOP(*tmul); } /************************************************* @@ -147,13 +191,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly * * standard representatives. * * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* @@ -166,13 +215,18 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a * Assumes coefficients to be standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* @@ -188,12 +242,16 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { +unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { h->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); s += h->coeffs[i]; } + + DBENCH_STOP(*tround); return s; } @@ -202,42 +260,56 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const * * Description: Use hint polynomial to correct the high bits of a polynomial. 
* -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial * - const poly *h: pointer to input hint polynomial **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); +void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM2_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm * * Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM2_CLEAN_reduce32(). * * Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm is strictly smaller than B and 1 otherwise. +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { +int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + /* It is ok to leak which coefficient violates the bound since the probability for each coefficient is independent of secret data but we must not leak the sign of the centralized representative. 
*/ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); - t ^= (t >> 31); - t = (Q - 1) / 2 - t; + for (i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); - if ((uint32_t)t >= B) { + if (t >= B) { + DBENCH_STOP(*tsample); return 1; } } + + DBENCH_STOP(*tsample); return 0; } @@ -245,23 +317,23 @@ int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, uint32_t B) { * Name: rej_uniform * * Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. * -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_uniform( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos + 3 <= buflen) { @@ -275,6 +347,7 @@ static size_t rej_uniform( } } + DBENCH_STOP(*tsample); return ctr; } @@ -282,22 +355,20 @@ static size_t rej_uniform( * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform * * Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). 
+* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). * * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; stream128_state state; stream128_init(&state, seed, nonce); @@ -307,52 +378,55 @@ void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, while (ctr < N) { off = buflen % 3; - for (size_t i = 0; i < off; ++i) { + for (i = 0; i < off; ++i) { buf[i] = buf[buflen - off + i]; } - buflen = STREAM128_BLOCKBYTES + off; stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); } - stream128_ctx_release(&state); + stream128_release(&state); } /************************************************* * Name: rej_eta * * Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_eta( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t0, t1; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos < buflen) { t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - t0; } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; } } + DBENCH_STOP(*tsample); return ctr; } @@ -360,345 +434,434 @@ static size_t rej_eta( * Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta * * Description: Sample polynomial with uniformly random coefficients -* in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; stream128_state state; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - ctr = rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); + ctr = rej_eta(a->coeffs, N, buf, buflen); while (ctr < N) { stream128_squeezeblocks(buf, 1, &state); ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); } - stream128_ctx_release(&state); + stream128_release(&state); } /************************************************* -* Name: rej_gamma1m1 -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. 
-**************************************************/ -static size_t rej_gamma1m1( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1 +* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1 * * Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES +* - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 16-bit nonce **************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(poly *a, +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; stream256_state state; stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + stream256_release(&state); + PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(a, buf); +} - ctr = rej_gamma1m1(a->coeffs, N, buf, buflen); +/************************************************* +* Name: PQCLEAN_DILITHIUM2_CLEAN_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, sizeof buf, &state); - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1(a->coeffs + ctr, N - ctr, buf, buflen); + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)buf[i] << 8 * i; } - stream256_ctx_release(&state); + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf, sizeof buf, &state); + pos = 0; + } + + b = buf[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack * * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. -* Input coefficients are assumed to lie in [Q-ETA,Q+ETA].
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes +* POLYETA_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; uint8_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]); - t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]); - r[i] = (uint8_t)(t[0] | (t[1] << 4)); + for (i = 0; i < N / 8; ++i) { + t[0] = (uint8_t) (ETA - a->coeffs[8 * i + 0]); + t[1] = (uint8_t) (ETA - a->coeffs[8 * i + 1]); + t[2] = (uint8_t) (ETA - a->coeffs[8 * i + 2]); + t[3] = (uint8_t) (ETA - a->coeffs[8 * i + 3]); + t[4] = (uint8_t) (ETA - a->coeffs[8 * i + 4]); + t[5] = (uint8_t) (ETA - a->coeffs[8 * i + 5]); + t[6] = (uint8_t) (ETA - a->coeffs[8 * i + 6]); + t[7] = (uint8_t) (ETA - a->coeffs[8 * i + 7]); + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack * * Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[i] & 0x0F; - r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; } + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack * -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. * Input coefficients are assumed to be standard representatives. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes +* POLYT1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); } + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack * -* Description: Unpack polynomial t1 with 9-bit coefficients. +* Description: Unpack polynomial t1 with 10-bit coefficients. * Output coefficients are standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack * * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes +* POLYT0_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { - uint32_t t[4]; + unsigned int i; + uint32_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; - r[7 * i + 0] = (uint8_t)(t[0]); - r[7 * i + 1] = (uint8_t)(t[0] >> 8); - r[7 * i + 1] |= (uint8_t)(t[1] << 6); - r[7 * i + 2] = (uint8_t)(t[1] >> 2); - r[7 * i + 3] = (uint8_t)(t[1] >> 10); - r[7 * i + 3] |= (uint8_t)(t[2] << 4); - r[7 * i + 4] = (uint8_t)(t[2] >> 4); - r[7 * i + 5] = (uint8_t)(t[2] >> 12); - r[7 * i + 5] |= (uint8_t)(t[3] << 2); - r[7 * i + 6] = (uint8_t)(t[3] >> 6); + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + 
r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack * * Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 
6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyz_pack * -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes +* POLYZ_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyz_pack(uint8_t *r, const poly *a) { - uint32_t t[2]; + unsigned int i; + uint32_t t[4]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; + for (i = 0; i < N / 4; ++i) { + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; - r[5 * i + 0] = (uint8_t)t[0]; - r[5 * i + 1] = (uint8_t)(t[0] >> 8); - r[5 * i + 2] = (uint8_t)(t[0] >> 16); - r[5 * i + 2] |= (uint8_t)(t[1] << 4); - r[5 * i + 3] = (uint8_t)(t[1] >> 4); - r[5 * i + 4] = (uint8_t)(t[1] >> 12); + r[9 * i + 0] = (uint8_t) t[0]; + r[9 * i + 1] = (uint8_t) (t[0] >> 8); + r[9 * i + 2] = (uint8_t) (t[0] >> 16); + r[9 * i + 2] |= (uint8_t) (t[1] << 2); + r[9 * i + 3] = (uint8_t) (t[1] >> 6); + r[9 * i + 4] = (uint8_t) (t[1] >> 14); + r[9 * i + 4] |= (uint8_t) (t[2] << 4); + r[9 * i + 5] = (uint8_t) (t[2] >> 4); + r[9 * i + 6] = (uint8_t) (t[2] >> 12); + r[9 * i + 6] |= (uint8_t) (t[3] << 6); + r[9 * i + 7] = (uint8_t) (t[3] >> 2); + r[9 * i + 8] = (uint8_t) (t[3] >> 10); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack * * Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; + unsigned int i; + DBENCH_START(); - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = a[9 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; + r->coeffs[4 * i + 0] &= 0x3FFFF; - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; + r->coeffs[4 * i + 1] &= 0x3FFFF; + + r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; + r->coeffs[4 * i + 2] &= 0x3FFFF; + + r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; + r->coeffs[4 * i + 3] &= 0x3FFFF; + + r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; + r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: 
PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack * -* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. * Input coefficients are assumed to be standard representatives. * * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes +* POLYW1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[3 * i + 0] = (uint8_t) a->coeffs[4 * i + 0]; + r[3 * i + 0] |= (uint8_t) (a->coeffs[4 * i + 1] << 6); + r[3 * i + 1] = (uint8_t) (a->coeffs[4 * i + 1] >> 2); + r[3 * i + 1] |= (uint8_t) (a->coeffs[4 * i + 2] << 4); + r[3 * i + 2] = (uint8_t) (a->coeffs[4 * i + 2] >> 4); + r[3 * i + 2] |= (uint8_t) (a->coeffs[4 * i + 3] << 2); } + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium2/clean/poly.h b/crypto_sign/dilithium2/clean/poly.h index 30eb9e9d..cd5660c1 100644 --- a/crypto_sign/dilithium2/clean/poly.h +++ b/crypto_sign/dilithium2/clean/poly.h @@ -1,53 +1,40 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_POLY_H #define PQCLEAN_DILITHIUM2_CLEAN_POLY_H - -#include +#include "params.h" #include -#include "params.h" - typedef struct { - uint32_t coeffs[N]; + int32_t coeffs[N]; } poly; void PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(poly *a); void PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_poly_add( - poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM2_CLEAN_poly_sub( - poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2_CLEAN_poly_add(poly *c, const poly *a, const poly *b); +void 
PQCLEAN_DILITHIUM2_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(poly *a); void PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery( - poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round( - poly *a1, poly *a0, const poly *a); -void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose( - poly *a1, poly *a0, const poly *a); -uint32_t PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint( - poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint( - poly *a, const poly *b, const poly *h); +void PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); -int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm( - const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform( - poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta( - poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce); +int PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +void 
PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); void PQCLEAN_DILITHIUM2_CLEAN_polyeta_pack(uint8_t *r, const poly *a); void PQCLEAN_DILITHIUM2_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); diff --git a/crypto_sign/dilithium2/clean/polyvec.c b/crypto_sign/dilithium2/clean/polyvec.c index 4d156d18..d2bec78d 100644 --- a/crypto_sign/dilithium2/clean/polyvec.c +++ b/crypto_sign/dilithium2/clean/polyvec.c @@ -1,14 +1,65 @@ -#include -#include - #include "params.h" #include "poly.h" #include "polyvec.h" +#include + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). +* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for (i = 0; i < K; ++i) { + for (j = 0; j < L; ++j) { + PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); + } + } +} + +void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void 
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); + } +} + +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]); + } +} + /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze * @@ -18,7 +69,9 @@ * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); } } @@ -33,9 +86,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v) { * - const polyvecl *u: pointer to first summand * - const polyvecl *v: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add( - polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -49,32 +103,49 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add( * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); } } +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + 
+void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials * in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. Output -* coeffcient are less than 2*L*Q. * * Arguments: - poly *w: output polynomial * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v) { +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + unsigned int i; poly t; - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]); - - for (size_t i = 1; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&t, &u->vec[i], &v->vec[i]); + PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); + for (i = 1; i < L; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); PQCLEAN_DILITHIUM2_CLEAN_poly_add(w, w, &t); } } @@ -83,17 +154,19 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( * Name: PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. 
+* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(). * * Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { - for (size_t i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { +int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { return 1; } } @@ -105,32 +178,43 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { /************ Vectors of polynomials of length K **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. +* to representatives in [-6283009,6283007]. 
* * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq * * Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. +* add Q if coefficient is negative. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_csubq(&v->vec[i]); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_caddq(&v->vec[i]); } } @@ -143,7 +227,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_freeze(&v->vec[i]); } } @@ -158,9 +244,10 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v) { * - const polyveck *u: pointer to first summand * - const polyveck *v: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_add(&w->vec[i], 
&u->vec[i], &v->vec[i]); } } @@ -169,17 +256,17 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub * * Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. +* No modular reduction is performed. * * Arguments: - polyveck *w: pointer to output vector * - const polyveck *u: pointer to first input vector * - const polyveck *v: pointer to second input vector to be * subtracted from first input vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -188,12 +275,14 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
* * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_shiftl(&v->vec[i]); } } @@ -207,13 +296,15 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. Input coefficients need to be less @@ -221,27 +312,40 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v) { * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&v->vec[i]); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_tomont(&v->vec[i]); } } +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. 
-* Assumes input coefficients to be standard representatives. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(). * * Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { - for (size_t i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], B)) { +int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM2_CLEAN_poly_chknorm(&v->vec[i], bound)) { return 1; } } @@ -253,19 +357,20 @@ int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 * with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be * standard representatives. 
* * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -274,7 +379,7 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( * Name: PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 +* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 * with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we * set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. * Assumes coefficients to be standard representatives. 
@@ -282,12 +387,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM2_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -303,15 +409,13 @@ void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; +unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint( - &h->vec[i], &v0->vec[i], &v1->vec[i]); + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM2_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); } return s; @@ -324,13 +428,21 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( * * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *v: pointer to input vector +* - const polyveck *u: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint( - &w->vec[i], &v->vec[i], 
&h->vec[i]); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); } } diff --git a/crypto_sign/dilithium2/clean/polyvec.h b/crypto_sign/dilithium2/clean/polyvec.h index 3ffae9da..6d0eb473 100644 --- a/crypto_sign/dilithium2/clean/polyvec.h +++ b/crypto_sign/dilithium2/clean/polyvec.h @@ -1,25 +1,33 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H #define PQCLEAN_DILITHIUM2_CLEAN_POLYVEC_H - -#include - #include "params.h" #include "poly.h" +#include /* Vectors of polynomials of length L */ typedef struct { poly vec[L]; } polyvecl; +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(polyvecl *v); + void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(polyvecl *v); void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(polyvecl *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); -int PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B); + +int 
PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); @@ -28,31 +36,33 @@ typedef struct { poly vec[K]; } polyveck; +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + void PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(polyveck *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(polyveck *v); void PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(polyveck *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); void PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(polyveck *v); void PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); -int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm( - const polyveck *v, uint32_t B); +int PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(polyveck *v1, 
polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); #endif diff --git a/crypto_sign/dilithium2/clean/reduce.c b/crypto_sign/dilithium2/clean/reduce.c index 8444de2d..4d3946ff 100644 --- a/crypto_sign/dilithium2/clean/reduce.c +++ b/crypto_sign/dilithium2/clean/reduce.c @@ -1,60 +1,54 @@ -#include - #include "params.h" #include "reduce.h" +#include /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce * -* Description: For finite field element a with 0 <= a <= Q*2^32, -* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. * -* Arguments: - uint64_t: finite field element a +* Arguments: - int64_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a) { - uint64_t t; +int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; - t = a * QINV; - t &= (1ULL << 32) - 1; - t *= Q; - t = a + t; - t >>= 32; - return (uint32_t)t; + t = (int32_t)((uint64_t)a * (uint64_t)QINV); + t = (a - (int64_t)t * Q) >> 32; + return t; } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_reduce32 * -* Description: For finite field element a, compute r \equiv a (mod Q) -* such that 0 <= r < 2*Q. 
+* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. * -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a) { - uint32_t t; +int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a) { + int32_t t; - t = a & 0x7FFFFF; - a >>= 23; - t += (a << 13) - a; + t = (a + (1 << 22)) >> 23; + t = a - t * Q; return t; } /************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_csubq +* Name: PQCLEAN_DILITHIUM2_CLEAN_caddq * -* Description: Subtract Q if input coefficient is bigger than Q. +* Description: Add Q if input coefficient is negative. * -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) { - a -= Q; - a += ((int32_t)a >> 31) & Q; +int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; return a; } @@ -62,14 +56,14 @@ uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a) { * Name: PQCLEAN_DILITHIUM2_CLEAN_freeze * * Description: For finite field element a, compute standard -* representative r = a mod Q. +* representative r = a mod^+ Q. * -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a) { +int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a) { a = PQCLEAN_DILITHIUM2_CLEAN_reduce32(a); - a = PQCLEAN_DILITHIUM2_CLEAN_csubq(a); + a = PQCLEAN_DILITHIUM2_CLEAN_caddq(a); return a; } diff --git a/crypto_sign/dilithium2/clean/reduce.h b/crypto_sign/dilithium2/clean/reduce.h index fbd4b573..41cd3a31 100644 --- a/crypto_sign/dilithium2/clean/reduce.h +++ b/crypto_sign/dilithium2/clean/reduce.h @@ -1,21 +1,17 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H #define PQCLEAN_DILITHIUM2_CLEAN_REDUCE_H - +#include "params.h" #include -#define MONT 4193792U // 2^32 % Q -#define QINV 4236238847U // -q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 -/* a <= Q*2^32 => r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(uint64_t a); +int32_t PQCLEAN_DILITHIUM2_CLEAN_montgomery_reduce(int64_t a); -/* r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(uint32_t a); +int32_t PQCLEAN_DILITHIUM2_CLEAN_reduce32(int32_t a); -/* a < 2*Q => r < Q */ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_csubq(uint32_t a); +int32_t PQCLEAN_DILITHIUM2_CLEAN_caddq(int32_t a); -/* r < Q */ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(uint32_t a); +int32_t PQCLEAN_DILITHIUM2_CLEAN_freeze(int32_t a); #endif diff --git a/crypto_sign/dilithium2/clean/rounding.c b/crypto_sign/dilithium2/clean/rounding.c index ac7ce1e9..9b49fb2f 100644 --- a/crypto_sign/dilithium2/clean/rounding.c +++ b/crypto_sign/dilithium2/clean/rounding.c @@ -1,86 +1,70 @@ #include "params.h" #include "rounding.h" +#include /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_power2round * * Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. * Assumes a to be standard representative. 
* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 * * Returns a1. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0) { - uint32_t t; +int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a) { + int32_t a1; - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; + a1 = (a + (1 << (D - 1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_decompose * * Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except * if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard * representative. * -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 * * Returns a1. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; +int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a) { + int32_t a1; - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFFu; - t += (int32_t)((a >> 19u) << 9u); - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= (uint32_t)t; + a1 = (a + 127) >> 7; + a1 = (a1 * 11275 + (1 << 23)) >> 24; + a1 ^= ((43 - a1) >> 31) & a1; - /* Divide by ALPHA (possible to avoid) */ - u = (int32_t)(a - 1); - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - - /* Border case */ - *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); - a &= 0xFu; - return a; + *a0 = a - a1 * 2 * GAMMA2; + *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; + return a1; } /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_make_hint * * Description: Compute hint bit indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. +* input element overflow into the high bits. * -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element * -* Returns 1 if high bits of a and b differ and 0 otherwise. +* Returns 1 if overflow. **************************************************/ -unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; +unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1) { + if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { + return 1; } - return 1; + return 0; } /************************************************* @@ -88,30 +72,27 @@ unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1) { * * Description: Correct high bits according to hint. 
* -* Arguments: - uint32_t a: input element +* Arguments: - int32_t a: input element * - unsigned int hint: hint bit * * Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint) { - uint32_t a0, a1; +int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; - a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(a, &a0); + a1 = PQCLEAN_DILITHIUM2_CLEAN_decompose(&a0, a); if (hint == 0) { return a1; } - if (a0 > Q) { - return (a1 + 1) & 0xF; + + if (a0 > 0) { + if (a1 == 43) { + return 0; + } + return a1 + 1; } - - return (a1 - 1) & 0xF; - - /* If PQCLEAN_DILITHIUM2_CLEAN_decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ + if (a1 == 0) { + return 43; + } + return a1 - 1; } diff --git a/crypto_sign/dilithium2/clean/rounding.h b/crypto_sign/dilithium2/clean/rounding.h index 5010ce35..8542a00e 100644 --- a/crypto_sign/dilithium2/clean/rounding.h +++ b/crypto_sign/dilithium2/clean/rounding.h @@ -1,11 +1,14 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H #define PQCLEAN_DILITHIUM2_CLEAN_ROUNDING_H - +#include "params.h" #include -uint32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(uint32_t a, uint32_t *a0); -uint32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(uint32_t a, unsigned int hint); +int32_t PQCLEAN_DILITHIUM2_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM2_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM2_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM2_CLEAN_use_hint(int32_t a, unsigned int hint); #endif diff --git a/crypto_sign/dilithium2/clean/sign.c b/crypto_sign/dilithium2/clean/sign.c index 24187654..0ea6d242 100644 --- 
a/crypto_sign/dilithium2/clean/sign.c +++ b/crypto_sign/dilithium2/clean/sign.c @@ -1,6 +1,3 @@ -#include -#include - #include "fips202.h" #include "packing.h" #include "params.h" @@ -9,84 +6,7 @@ #include "randombytes.h" #include "sign.h" #include "symmetric.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). -* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < L; ++j) { - PQCLEAN_DILITHIUM2_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t)((i << 8) + j)); - } - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM2_CLEAN_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - uint8_t b; - size_t pos; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -((int32_t)signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - shake256_ctx_release(&state); -} +#include /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair @@ -94,9 +14,9 @@ void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, * Description: Generates public and private key. 
* * Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) * - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ @@ -104,48 +24,42 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { uint8_t seedbuf[3 * SEEDBYTES]; uint8_t tr[CRHBYTES]; const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; polyvecl mat[K]; polyvecl s1, s1hat; - polyveck s2, t, t1, t0; + polyveck s2, t1, t0; - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); rho = seedbuf; rhoprime = seedbuf + SEEDBYTES; key = seedbuf + 2 * SEEDBYTES; /* Expand matrix */ - PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); /* Sample short vectors s1 and s2 */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s1.vec[i], rhoprime, nonce++); - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_eta(&s2.vec[i], rhoprime, nonce++); - } + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); /* Matrix-vector multiplication */ s1hat = s1; PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&t.vec[i]); - } + 
PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&t1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&t1); /* Add error vector s2 */ - PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t, &t, &s2); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&t1, &t1, &s2); /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&t); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&t1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_power2round(&t1, &t0, &t1); PQCLEAN_DILITHIUM2_CLEAN_pack_pk(pk, rho, &t1); /* Compute CRH(rho, t1) and write secret key */ crh(tr, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + PQCLEAN_DILITHIUM2_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; } @@ -153,44 +67,41 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature * -* Description: Compute signed message. +* Description: Computes signature. 
* -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES -* of len) -* - size_t *smlen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *msg, size_t mlen, - const uint8_t *sk) { +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + unsigned int n; uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint32_t n; uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; + polyvecl mat[K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + shake256incctx state; rho = seedbuf; tr = rho + SEEDBYTES; key = tr + CRHBYTES; mu = key + SEEDBYTES; rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + PQCLEAN_DILITHIUM2_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); - // use incremental hash API instead of copying around buffers /* Compute CRH(tr, msg) */ - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, msg, mlen); + shake256_inc_absorb(&state, m, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); @@ -198,76 
+109,71 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature( crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&s1); PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&s2); PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t0); rej: /* Sample intermediate vector y */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_uniform_gamma1m1(&y.vec[i], rhoprime, nonce++); - } + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); /* Matrix-vector multiplication */ - yhat = y; - PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM2_CLEAN_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&w.vec[i]); - } + z = y; + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM2_CLEAN_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_decompose(&w1, &w0, &w1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(sig, &w1); - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &cs2); - 
PQCLEAN_DILITHIUM2_CLEAN_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, sig); + PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&z.vec[i]); - } + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_invntt_tomont(&z); PQCLEAN_DILITHIUM2_CLEAN_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM2_CLEAN_polyvecl_freeze(&z); + PQCLEAN_DILITHIUM2_CLEAN_polyvecl_reduce(&z); if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { goto rej; } - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM2_CLEAN_poly_invntt_montgomery(&ct0.vec[i]); - } - - PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&ct0, GAMMA2)) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w0, &w0, &h); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w0); + if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { goto rej; } - PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&w0); + /* Compute hints for w1 */ + 
PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&h); + if (PQCLEAN_DILITHIUM2_CLEAN_polyveck_chknorm(&h, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM2_CLEAN_polyveck_add(&w0, &w0, &h); n = PQCLEAN_DILITHIUM2_CLEAN_polyveck_make_hint(&h, &w0, &w1); if (n > OMEGA) { goto rej; } /* Write signature */ - PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, &z, &h, &c); + PQCLEAN_DILITHIUM2_CLEAN_pack_sig(sig, sig, &z, &h); *siglen = PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; return 0; } @@ -281,53 +187,63 @@ rej: * array with PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen bytes), * can be equal to m * - size_t *smlen: pointer to output length of signed -* message +* message * - const uint8_t *m: pointer to message to be signed * - size_t mlen: length of message * - const uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, m, mlen); - rc = PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; -} +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + size_t i; + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} /************************************************* * Name: PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify * -* Description: Verify signed message. +* Description: Verifies signature. 
* -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key +* Arguments: - uint8_t *m: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key * -* Returns 0 if signed message could be verified correctly and -1 otherwise +* Returns 0 if signature could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk) { +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; uint8_t rho[SEEDBYTES]; uint8_t mu[CRHBYTES]; - poly c, chat, cp; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; + polyveck t1, w1, h; + shake256incctx state; - if (siglen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { return -1; } PQCLEAN_DILITHIUM2_CLEAN_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(&z, &h, &c, sig)) { + if (PQCLEAN_DILITHIUM2_CLEAN_unpack_sig(c, &z, &h, sig)) { return -1; } if (PQCLEAN_DILITHIUM2_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { @@ -336,8 +252,6 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( /* Compute CRH(CRH(rho, t1), msg) */ crh(mu, pk, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -346,38 +260,39 @@ int 
PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( shake256_inc_ctx_release(&state); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM2_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM2_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM2_CLEAN_polyvecl_ntt(&z); - for (size_t i = 0; i < K ; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); - } + PQCLEAN_DILITHIUM2_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); - chat = c; - PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&chat); + PQCLEAN_DILITHIUM2_CLEAN_poly_ntt(&cp); PQCLEAN_DILITHIUM2_CLEAN_polyveck_shiftl(&t1); PQCLEAN_DILITHIUM2_CLEAN_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM2_CLEAN_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); - } + PQCLEAN_DILITHIUM2_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&tmp1, &tmp1, &tmp2); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&tmp1); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_montgomery(&tmp1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_sub(&w1, &w1, &t1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_invntt_tomont(&w1); /* Reconstruct w1 */ - PQCLEAN_DILITHIUM2_CLEAN_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &tmp1, &h); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM2_CLEAN_polyveck_pack_w1(buf, &w1); - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM2_CLEAN_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { + /* Call random oracle and verify PQCLEAN_DILITHIUM2_CLEAN_challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + 
shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { return -1; } } - // All good return 0; } @@ -387,7 +302,7 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( * Description: Verify signed message. * * Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm +* array with smlen bytes), can be equal to sm * - size_t *mlen: pointer to output length of message * - const uint8_t *sm: pointer to signed message * - size_t smlen: length of signed message @@ -395,33 +310,34 @@ int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify( * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { + for (i = 0; i < *mlen; ++i) { m[i] = sm[PQCLEAN_DILITHIUM2_CLEAN_CRYPTO_BYTES + i]; } return 0; } - /* Signature verification failed */ badsig: + /* Signature verification failed */ *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { + for (i = 0; i < smlen; ++i) { m[i] = 0; } return -1; } - diff --git 
a/crypto_sign/dilithium2/clean/sign.h b/crypto_sign/dilithium2/clean/sign.h index 4196d29b..3b151d58 100644 --- a/crypto_sign/dilithium2/clean/sign.h +++ b/crypto_sign/dilithium2/clean/sign.h @@ -1,12 +1,29 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_SIGN_H #define PQCLEAN_DILITHIUM2_CLEAN_SIGN_H - -#include "api.h" #include "params.h" #include "poly.h" #include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM2_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); -void PQCLEAN_DILITHIUM2_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM2_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); #endif diff --git a/crypto_sign/dilithium2/clean/stream.c b/crypto_sign/dilithium2/clean/stream.c deleted file mode 100644 index e862e9de..00000000 --- a/crypto_sign/dilithium2/clean/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t 
buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium2/clean/stream.h b/crypto_sign/dilithium2/clean/stream.h deleted file mode 100644 index d607ce99..00000000 --- a/crypto_sign/dilithium2/clean/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM2_CLEAN_STREAM_H -#define PQCLEAN_DILITHIUM2_CLEAN_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium2/clean/symmetric-shake.c b/crypto_sign/dilithium2/clean/symmetric-shake.c new file mode 100644 index 00000000..1decd901 --- /dev/null +++ b/crypto_sign/dilithium2/clean/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium2/clean/symmetric.h b/crypto_sign/dilithium2/clean/symmetric.h index ee2b8103..cbbb11d4 100644 --- 
a/crypto_sign/dilithium2/clean/symmetric.h +++ b/crypto_sign/dilithium2/clean/symmetric.h @@ -1,25 +1,36 @@ #ifndef PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H #define PQCLEAN_DILITHIUM2_CLEAN_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - #include "fips202.h" +#include "params.h" +#include -#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2_CLEAN_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, 
STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) #endif diff --git a/crypto_sign/dilithium2aes/META.yml b/crypto_sign/dilithium2aes/META.yml new file mode 100644 index 00000000..6a4fe381 --- /dev/null +++ b/crypto_sign/dilithium2aes/META.yml @@ -0,0 +1,31 @@ +name: Dilithium2-AES +type: signature +claimed-nist-level: 2 +length-public-key: 1312 +length-secret-key: 2544 +length-signature: 2420 +nistkat-sha256: 23972a0a5f1f32781aa11fa57d9994ddd53c1bbcc732967f61d9d9aaef01c492 +testvectors-sha256: 22e68fe8bf781dee949a4297f9ba44d1c350a1d88bae03117cfb2ca494c6e604 +principal-submitters: + - Vadim Lyubashevsky +auxiliary-submitters: + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Peter Schwabe + - Gregor Seiler + - Damien Stehlé +implementations: + - name: clean + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium2aes/avx2/LICENSE b/crypto_sign/dilithium2aes/avx2/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium2aes/avx2/Makefile b/crypto_sign/dilithium2aes/avx2/Makefile new file mode 100644 index 00000000..a973d785 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/Makefile @@ -0,0 +1,23 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium2aes_avx2.a +HEADERS=aes256ctr.h align.h api.h cdecl.h consts.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=aes256ctr.o consts.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o invntt.o ntt.o pointwise.o shuffle.o +CFLAGS=-mavx2 -maes -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium2aes/avx2/aes256ctr.c b/crypto_sign/dilithium2aes/avx2/aes256ctr.c new file mode 100644 index 00000000..82463cb5 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/aes256ctr.c @@ -0,0 +1,142 @@ +#include "aes256ctr.h" +#include +#include +#include +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + * separated 64-bit nonce and internal 64-bit counter, starting from zero + * Public Domain */ + + +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + + /* Load current counter value */ + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 = 
_mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); + + for (int i = 1; i < 14; i++) { + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); + } + + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); + + /* Write results */ + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); +} + +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; + + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n = _mm_loadl_epi64((__m128i *)&nonce); + + state->rkeys[idx++] = key0; + temp0 = key0; + temp2 = key1; + temp4 = _mm_setzero_si128(); + +#define BLOCK1(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \ + state->rkeys[idx++] = temp2; \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x10); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x8c); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xff); \ + temp0 = _mm_xor_si128(temp0, temp1) + +#define BLOCK2(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ + state->rkeys[idx++] = temp0; \ + temp4 = 
(__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x10); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x8c); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xaa); \ + temp2 = _mm_xor_si128(temp2, temp1) + + BLOCK1(0x01); + BLOCK2(0x01); + + BLOCK1(0x02); + BLOCK2(0x02); + + BLOCK1(0x04); + BLOCK2(0x04); + + BLOCK1(0x08); + BLOCK2(0x08); + + BLOCK1(0x10); + BLOCK2(0x10); + + BLOCK1(0x20); + BLOCK2(0x20); + + BLOCK1(0x40); + state->rkeys[idx++] = temp0; +} + +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state) { + size_t i; + for (i = 0; i < nblocks; i++) { + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i; + uint8_t buf[64]; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&state, seed, nonce); + + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; + out += 64; + } + + if (outlen) { + aesni_encrypt4(buf, &state.n, state.rkeys); + for (i = 0; i < outlen; i++) { + out[i] = buf[i]; + } + } +} diff --git a/crypto_sign/dilithium2aes/avx2/aes256ctr.h b/crypto_sign/dilithium2aes/avx2/aes256ctr.h new file mode 100644 index 00000000..cdc936ba --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/aes256ctr.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H +#define PQCLEAN_DILITHIUM2AES_AVX2_AES256CTR_H + +#include +#include +#include + + +#define AES256CTR_BLOCKBYTES 64 + +typedef struct { + __m128i rkeys[16]; + __m128i n; +} aes256ctr_ctx; + +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint64_t nonce); + +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state); 
+ +void PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/align.h b/crypto_sign/dilithium2aes/avx2/align.h new file mode 100644 index 00000000..c2c4e057 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM2AES_AVX2_ALIGN_H + +#include +#include + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/api.h b/crypto_sign/dilithium2aes/avx2/api.h new file mode 100644 index 00000000..84f0634b --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/api.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_API_H +#define PQCLEAN_DILITHIUM2AES_AVX2_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES 2544 +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES 2420 +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES" + + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/cdecl.h b/crypto_sign/dilithium2aes/avx2/cdecl.h new file mode 100644 index 
00000000..20d4580c --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/cdecl.h @@ -0,0 +1,24 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM2AES_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (consts.c). + * + * This define helps us get around this + */ + +#define _cdecl(s) _##s +#define cdecl(s) s + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/consts.c b/crypto_sign/dilithium2aes/avx2/consts.c new file mode 100644 index 00000000..82736730 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) + +const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400,
-1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 
1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + -1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, 
-2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium2aes/avx2/consts.h 
b/crypto_sign/dilithium2aes/avx2/consts.h new file mode 100644 index 00000000..d02cf328 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM2AES_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM2AES_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/invntt.S b/crypto_sign/dilithium2aes/avx2/invntt.S new file mode 100644 index 00000000..95c8127d --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/invntt.S @@ -0,0 +1,240 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 + +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h + +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +.endm + +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +/* level 0 */ +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 + +vpermq 
$0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 + +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 + +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 + +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 + +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 + +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 + +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 + +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 + +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 
128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) + +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd 
$0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx) +cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx): +_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 1 +levels6t7 2 +levels6t7 3 + +ret diff --git a/crypto_sign/dilithium2aes/avx2/ntt.S b/crypto_sign/dilithium2aes/avx2/ntt.S new file mode 100644 index 00000000..d04e2b62 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/ntt.S @@ -0,0 +1,199 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 + +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l +.endm + +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 + +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +/* level 1 */ +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 
+butterfly 8,10 +butterfly 9,11 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm + +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 + +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 + +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 + +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 + +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 
4,11,1,10,2,15 + +/* level 7 */ +vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx) +cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx): +_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 + +ret + diff --git a/crypto_sign/dilithium2aes/avx2/ntt.h b/crypto_sign/dilithium2aes/avx2/ntt.h new file mode 100644 index 00000000..24c2ad18 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/ntt.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_NTT_H +#define PQCLEAN_DILITHIUM2AES_AVX2_NTT_H + +#include + +void PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); +void PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); + +void PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(__m256i *a); + +void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(__m256i *c, const __m256i 
*a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); +void PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM2AES_AVX2_qdata); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/packing.c b/crypto_sign/dilithium2aes/avx2/packing.c new file mode 100644 index 00000000..5ef0a359 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). +* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for
(i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t tr[]: output byte array for tr +* - uint8_t key[]: output byte array for key +* - polyveck *t0: pointer to output vector t0 +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h).
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM2AES_AVX2_challenge hash length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h: bytes 0..OMEGA-1 list the indices of the nonzero coefficients, bytes OMEGA..OMEGA+K-1 hold the running index count after each polynomial */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; /* NOTE(review): k is not bounds-checked here, so h presumably has at most OMEGA nonzero coefficients in total; confirm against the caller */ + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig +* +* Description: Unpack signature sig = (c, z, h) and validate the hint encoding. +* +* Arguments: - uint8_t *c: pointer to output challenge hash (SEEDBYTES bytes are copied) +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature (hint counts decreasing or above OMEGA, hint indices not strictly increasing, nonzero unused index bytes); otherwise 0.
+**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h: byte OMEGA+i holds the cumulative number of hint indices up to and including polynomial i */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Indices within one polynomial must be strictly increasing (strong unforgeability) */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Unused index bytes must be zero (strong unforgeability) */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium2aes/avx2/packing.h b/crypto_sign/dilithium2aes/avx2/packing.h new file mode 100644 index 00000000..d5c0a4c3 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM2AES_AVX2_PACKING_H +#include "params.h" +#include "polyvec.h" +#include + +void PQCLEAN_DILITHIUM2AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM2AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM2AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void
PQCLEAN_DILITHIUM2AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM2AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/params.h b/crypto_sign/dilithium2aes/avx2/params.h new file mode 100644 index 00000000..b6b4a6c2 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM2AES_AVX2_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 4 +#define L 4 +#define ETA 2 +#define TAU 39 +#define BETA 78 +#define GAMMA1 (1 << 17) +#define GAMMA2 ((Q-1)/88) +#define OMEGA 80 +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_ALGNAME "Dilithium2-AES" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 576 + +#define POLYW1_PACKEDBYTES 192 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/pointwise.S b/crypto_sign/dilithium2aes/avx2/pointwise.S new file mode 100644 index 00000000..2f1b8386 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/pointwise.S @@ -0,0 
+1,199 @@ +#include "params.h" +#include "cdecl.h" + +.text +.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx) +.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx): +_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm14,%ymm15 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq 
%ymm5,%ymm13,%ymm5 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm6,%ymm10,%ymm6 +vpmuldq %ymm7,%ymm11,%ymm7 +vpmuldq %ymm8,%ymm12,%ymm8 +vpmuldq %ymm9,%ymm13,%ymm9 +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 +.endm + +.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx) +.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx): +_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 + +pointwise 1024 +acc + +pointwise 2048 +acc + +pointwise 3072 +acc + + + + +#reduce +vpmuldq %ymm0,%ymm2,%ymm6 +vpmuldq %ymm0,%ymm3,%ymm7 +vpmuldq %ymm0,%ymm4,%ymm8 +vpmuldq %ymm0,%ymm5,%ymm9 +vpmuldq %ymm1,%ymm6,%ymm6 +vpmuldq %ymm1,%ymm7,%ymm7 +vpmuldq %ymm1,%ymm8,%ymm8 +vpmuldq %ymm1,%ymm9,%ymm9 +vpsubq %ymm6,%ymm2,%ymm2 +vpsubq %ymm7,%ymm3,%ymm3 +vpsubq %ymm8,%ymm4,%ymm4 +vpsubq %ymm9,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd 
$0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium2aes/avx2/poly.c b/crypto_sign/dilithium2aes/avx2/poly.c new file mode 100644 index 00000000..896252de --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/poly.c @@ -0,0 +1,891 @@ +#include "align.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" +#include +#include +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); /* g = (f plus 2^22) arithmetically shifted right by 23, i.e. f divided by 2^23 and rounded */ + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); /* subtract g times the loaded q vector from f */ + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); /* per 32-bit lane: q where the sign bit of f is set, zero elsewhere */ + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives, done by calling poly_reduce followed by poly_caddq. Assumes input +* coefficients to be at most 2^31 - 2^22 + 1 in +* absolute value. NOTE(review): the poly_reduce header states 2^31 - 2^22 - 1; confirm which bound is intended. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(a); + PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(a); + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed.
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { /* one 256-bit vector, i.e. eight 32-bit lanes, per iteration */ + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_sub +* +* Description: Subtract polynomials lane by lane (c = a - b). No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { /* one 256-bit vector, i.e. eight 32-bit lanes, per iteration */ + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i f; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* positive standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be positive standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint +* +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. Thin wrapper around PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx. +* +* Arguments: - uint8_t hint[N]: output hint array (preallocated, length N) +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of hints, i.e. length of hint array (the value returned by make_hint_avx). +**************************************************/ +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(hint, a0->vec, a1->vec); + + DBENCH_STOP(*tround); + return r; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial.
+* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(b->vec, a->vec, h->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + r = 1 - _mm256_testz_si256(t, t); + DBENCH_STOP(*tsample); + return r; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits of the 3-byte little-endian value */ + + if (t < Q) { + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform (and PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit) +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES (poly_uniform only) +* - uint16_t nonce: 2-byte nonce (poly_uniform only); the _preinit variant instead takes a stream128_state *state that the caller has already initialized +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + /* length of buf is always divisible by 3; hence, no bytes left. NOTE(review): each refill below passes one block of STREAM128_BLOCKBYTES bytes and rej_uniform ignores up to 2 trailing bytes of it; confirm this matches the reference stream consumption when the block size is not a multiple of 3 */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA]
by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; /* low nibble */ + t1 = buf[pos++] >> 4; /* high nibble */ + + if (t0 < 15) { /* nibble value 15 is rejected so the remaining 15 values map evenly onto 5 residues */ + t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5: (205 * t0) >> 10 equals t0 / 5 for every t0 below 15 */ + a[ctr++] = 2 - t0; /* result in [-2, 2]; the constant 2 hard-codes ETA = 2 */ + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output stream of SHAKE256(seed|nonce) +* or AES256CTR(seed,nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf; /* NOTE(review): only REJ_UNIFORM_ETA_NBLOCKS blocks are squeezed into buf below, so this size looks larger than needed; confirm the intended length (and any over-read by rej_eta_avx) against rejsample.h */ + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { /* top up one stream block at a time with the scalar sampler until all N coefficients are set */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack reads 14 additional bytes */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(a, buf.coeffs); +} + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); /* the first 8 stream bytes supply the sign bits, copied in host byte order */ + pos = 8; + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); /* reject bytes above i so that b lies in [0, i] */ + + c->coeffs[i] = c->coeffs[b]; /* move the old value at position b to position i, then overwrite position b */ + c->coeffs[b] = 1 - 2 * (signs & 1); /* 1 when the current sign bit is 0, -1 when it is 1 */ + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = ETA - a->coeffs[8 * i + 0]; + t[1] = ETA - a->coeffs[8 * i + 1]; + t[2] = ETA - a->coeffs[8 * i + 2]; + t[3] = ETA - a->coeffs[8 * i + 3]; + t[4] = ETA - a->coeffs[8 * i + 4]; + t[5] = ETA - a->coeffs[8 * i + 5]; + t[6] = ETA - a->coeffs[8 * i + 6]; + t[7] = ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = t[0]; + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] |= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * i + 12] = t[7] >> 5; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + 
r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; + + r[9 * i + 0] = t[0]; + r[9 * i + 1] = t[0] >> 8; + r[9 * i + 2] = t[0] >> 16; + r[9 * i + 2] |= t[1] << 2; + r[9 * i + 3] = t[1] >> 6; + r[9 * i + 4] = t[1] >> 14; + r[9 * i + 4] |= t[2] << 4; + r[9 * i + 5] = t[2] >> 4; + r[9 * i + 6] = t[2] >> 12; + r[9 * i + 6] |= t[3] << 6; + r[9 * i + 7] = t[3] >> 2; + r[9 * i + 8] = t[3] >> 10; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial; must be readable for POLYZ_PACKEDBYTES + 14 bytes, since each iteration does a 32-byte load at a[18 * i] +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 14]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 9, 8, 7, -1, 7, 6, 5, -1, 5, 4, 3, -1, 3, 2, 1, + -1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); + const __m256i mask = _mm256_set1_epi32(0x3FFFF); + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_loadu_si256((__m256i *)&a[18 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); + _mm256_store_si256(&r->vec[i], f); + } + + DBENCH_STOP(*tpack); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES + 8 bytes, since each iteration stores 32 bytes at r[24 * i] +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3; + const __m256i shift1 = _mm256_set1_epi16((64 << 8) + 1); + const __m256i shift2 = _mm256_set1_epi32((4096 << 16) + 1); + const __m256i shufdidx1 = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + const __m256i shufdidx2 = _mm256_set_epi32(-1, -1, 6, 5, 4, 2, 1, 0); + const __m256i shufbidx = _mm256_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0, + -1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0); + DBENCH_START(); + + for (i = 0; i < N / 32; i++) { + f0 = _mm256_load_si256(&a->vec[4 * i + 0]); + f1 = _mm256_load_si256(&a->vec[4 * i + 1]); + f2 = _mm256_load_si256(&a->vec[4 * i + 2]); + f3 = _mm256_load_si256(&a->vec[4 * i + 3]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_maddubs_epi16(f0, shift1); + f0 = _mm256_madd_epi16(f0, shift2); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx1); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + f0 = _mm256_permutevar8x32_epi32(f0, shufdidx2); + _mm256_storeu_si256((__m256i *)&r[24 * i], f0); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium2aes/avx2/poly.h b/crypto_sign/dilithium2aes/avx2/poly.h new file mode 100644 index 00000000..7f066099 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/poly.h @@ -0,0 +1,52 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLY_H +#define PQCLEAN_DILITHIUM2AES_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" +#include + +typedef ALIGNED_INT32(N) poly; + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(poly *a); + 
+void PQCLEAN_DILITHIUM2AES_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); + 
+void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/polyvec.c b/crypto_sign/dilithium2aes/avx2/polyvec.c new file mode 100644 index 00000000..323fb1c5 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/polyvec.c @@ -0,0 +1,449 @@ +#include "aes256ctr.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include + +#define UNUSED(x) (void)x + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + uint64_t nonce; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&state, rho, 0); + + for (i = 0; i < K; i++) { + for (j = 0; j < L; j++) { + nonce = (i << 8) + j; + state.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&mat[i].vec[j], &state); + PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&mat[i].vec[j]); + } + } +} + + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length 
L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM2AES_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(). 
+* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a1, a0 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - uint8_t *hint: pointer to output hint array +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; + + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); + } + + return n; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. 
+* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium2aes/avx2/polyvec.h b/crypto_sign/dilithium2aes/avx2/polyvec.h new file mode 100644 index 00000000..67652026 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/polyvec.h @@ -0,0 +1,64 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM2AES_AVX2_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const 
polyvecl *u, + const polyvecl *v); + +int PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/rejsample.c b/crypto_sign/dilithium2aes/avx2/rejsample.c new file mode 
100644 index 00000000..0c26e4b6 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/rejsample.c @@ -0,0 +1,394 @@ +#include "params.h" +#include "rejsample.h" +#include "symmetric.h" +#include +#include + +const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 1, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 0, 0, 0, 0, 0}, + { 2, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0}, + { 1, 2, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 0, 0, 0, 0, 0}, + { 3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 
0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + { 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 
0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, + { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 
0, 0, 0, 0}, + { 1, 2, 6, 7, 0, 0, 0, 0}, + { 0, 1, 2, 6, 7, 0, 0, 0}, + { 3, 6, 7, 0, 0, 0, 0, 0}, + { 0, 3, 6, 7, 0, 0, 0, 0}, + { 1, 3, 6, 7, 0, 0, 0, 0}, + { 0, 1, 3, 6, 7, 0, 0, 0}, + { 2, 3, 6, 7, 0, 0, 0, 0}, + { 0, 2, 3, 6, 7, 0, 0, 0}, + { 1, 2, 3, 6, 7, 0, 0, 0}, + { 0, 1, 2, 3, 6, 7, 0, 0}, + { 4, 6, 7, 0, 0, 0, 0, 0}, + { 0, 4, 6, 7, 0, 0, 0, 0}, + { 1, 4, 6, 7, 0, 0, 0, 0}, + { 0, 1, 4, 6, 7, 0, 0, 0}, + { 2, 4, 6, 7, 0, 0, 0, 0}, + { 0, 2, 4, 6, 7, 0, 0, 0}, + { 1, 2, 4, 6, 7, 0, 0, 0}, + { 0, 1, 2, 4, 6, 7, 0, 0}, + { 3, 4, 6, 7, 0, 0, 0, 0}, + { 0, 3, 4, 6, 7, 0, 0, 0}, + { 1, 3, 4, 6, 7, 0, 0, 0}, + { 0, 1, 3, 4, 6, 7, 0, 0}, + { 2, 3, 4, 6, 7, 0, 0, 0}, + { 0, 2, 3, 4, 6, 7, 0, 0}, + { 1, 2, 3, 4, 6, 7, 0, 0}, + { 0, 1, 2, 3, 4, 6, 7, 0}, + { 5, 6, 7, 0, 0, 0, 0, 0}, + { 0, 5, 6, 7, 0, 0, 0, 0}, + { 1, 5, 6, 7, 0, 0, 0, 0}, + { 0, 1, 5, 6, 7, 0, 0, 0}, + { 2, 5, 6, 7, 0, 0, 0, 0}, + { 0, 2, 5, 6, 7, 0, 0, 0}, + { 1, 2, 5, 6, 7, 0, 0, 0}, + { 0, 1, 2, 5, 6, 7, 0, 0}, + { 3, 5, 6, 7, 0, 0, 0, 0}, + { 0, 3, 5, 6, 7, 0, 0, 0}, + { 1, 3, 5, 6, 7, 0, 0, 0}, + { 0, 1, 3, 5, 6, 7, 0, 0}, + { 2, 3, 5, 6, 7, 0, 0, 0}, + { 0, 2, 3, 5, 6, 7, 0, 0}, + { 1, 2, 3, 5, 6, 7, 0, 0}, + { 0, 1, 2, 3, 5, 6, 7, 0}, + { 4, 5, 6, 7, 0, 0, 0, 0}, + { 0, 4, 5, 6, 7, 0, 0, 0}, + { 1, 4, 5, 6, 7, 0, 0, 0}, + { 0, 1, 4, 5, 6, 7, 0, 0}, + { 2, 4, 5, 6, 7, 0, 0, 0}, + { 0, 2, 4, 5, 6, 7, 0, 0}, + { 1, 2, 4, 5, 6, 7, 0, 0}, + { 0, 1, 2, 4, 5, 6, 7, 0}, + { 3, 4, 5, 6, 7, 0, 0, 0}, + { 0, 3, 4, 5, 6, 7, 0, 0}, + { 1, 3, 4, 5, 6, 7, 0, 0}, + { 0, 1, 3, 4, 5, 6, 7, 0}, + { 2, 3, 4, 5, 6, 7, 0, 0}, + { 0, 2, 3, 4, 5, 6, 7, 0}, + { 1, 2, 3, 4, 5, 6, 7, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7} +}; + +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { + unsigned int ctr, pos; + uint32_t good; + __m256i d, tmp; + const __m256i bound = _mm256_set1_epi32(Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i 
idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, + -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, + -1, 5, 4, 3, -1, 2, 1, 0); /* -1 selectors zero the top byte of each 32-bit lane */ + + ctr = pos = 0; + while (pos <= REJ_UNIFORM_BUFLEN - 24) { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); /* 32-byte load: at the last position it reaches up to 8 bytes past REJ_UNIFORM_BUFLEN, hence the + 8 in the parameter */ + d = _mm256_permute4x64_epi64(d, 0x94); + d = _mm256_shuffle_epi8(d, idx8); /* spread 8 packed 3-byte values into 8 dwords */ + d = _mm256_and_si256(d, mask); /* keep the low 23 bits of every candidate */ + pos += 24; + + tmp = _mm256_sub_epi32(d, bound); + good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* one bit per lane: sign of candidate - Q; the cast intrinsic replaces the non-portable (__m256) vector cast */ + tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good])); + d = _mm256_permutevar8x32_epi32(d, tmp); /* move the accepted lanes to the front */ + + _mm256_storeu_si256((__m256i *)&r[ctr], d); /* stores all 8 lanes; only the first popcount(good) survive the ctr update */ + ctr += _mm_popcnt_u32(good); + + } + + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { + unsigned int ctr, pos; + uint32_t good; + __m256i f0, f1, f2; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(ETA); + const __m256i bound = mask; + const __m256i v = _mm256_set1_epi32(-6560); + const __m256i p = _mm256_set1_epi32(5); + + ctr = pos = 0; + while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); /* 16 input bytes become 32 byte lanes holding one 4-bit nibble each */ + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = _mm256_movemask_epi8(f1); /* bit set where nibble - 15 is negative, i.e. nibble < 15 */ + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; /* 8 nibbles = 4 input bytes handled per sub-step */ + + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i 
*)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); /* compact the accepted bytes of this 8-nibble group to the front */ + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); /* f1 += 5 * round(f1 * -6560 / 2^15); NOTE(review): agrees with the scalar tail below only if ETA == 2 - confirm in params.h */ + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { /* fewer than 8 free slots left: a full 8-lane store no longer fits */ + break; + } + g0 = _mm256_extracti128_si256(f0, 1); /* continue with the upper 128 bits of f0 */ + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM2AES_AVX2_idxlut[good]); /* after three 8-bit shifts only the last group's mask bits remain */ + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good); + pos += 4; + } + + uint32_t t0, t1; /* scalar tail: one byte (two nibbles) at a time until r is full or buf is exhausted */ + while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5, valid for t0 < 15 */ + r[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < N) { + t1 = t1 - (205 * t1 >> 10) * 5; /* t1 mod 5, valid for t1 < 15 */ + r[ctr++] = 2 - t1; + } + } + + return ctr; /* number of coefficients written to r */ +} diff --git a/crypto_sign/dilithium2aes/avx2/rejsample.h b/crypto_sign/dilithium2aes/avx2/rejsample.h new file mode 100644 index 00000000..154ae530 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/rejsample.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H +#define PQCLEAN_DILITHIUM2AES_AVX2_REJSAMPLE_H +#include "params.h" +#include "symmetric.h" +#include + +#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_BUFLEN 
(REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) + +#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) + +extern const uint8_t PQCLEAN_DILITHIUM2AES_AVX2_idxlut[256][8]; /* row m lists the positions of the set bits of m in ascending order, zero padded */ + +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); /* + 8: the 32-byte vector loads overrun the last 24-byte group */ + +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]); /* bound now matches the definition in rejsample.c */ + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/rounding.c b/crypto_sign/dilithium2aes/avx2/rounding.c new file mode 100644 index 00000000..054ce423 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/rounding.c @@ -0,0 +1,157 @@ +#include "consts.h" +#include "params.h" +#include "rejsample.h" +#include "rounding.h" +#include +#include +#include + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: power2round +* +* Description: For finite field elements a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be positive standard representative. 
+* +* Arguments: - __m256i *a1: output array of length N/8 with high bits +* - __m256i *a0: output array of length N/8 with low bits a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i mask = _mm256_set1_epi32(-(1 << D)); /* clears the low D bits */ + const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients per vector; aligned loads and stores */ + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, half); /* a + 2^(D-1) - 1 */ + f0 = _mm256_and_si256(f1, mask); /* a1 * 2^D */ + f1 = _mm256_srli_epi32(f1, D); /* a1 */ + f0 = _mm256_sub_epi32(f, f0); /* a0 = a - a1 * 2^D */ + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +/************************************************* +* Name: decompose +* +* Description: For finite field element a, compute high and low parts a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard +* representative. 
+* +* Arguments: - __m256i *a1: output array of length N/8 with high parts +* - __m256i *a0: output array of length N/8 with low parts a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1, t; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM2AES_AVX2_qdata.vec[_8XQ / 8]); /* presumably 8 copies of Q from the shared constants table - confirm in consts.h */ + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(11275); + const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(128); + const __m256i max = _mm256_set1_epi32(43); /* NOTE(review): 11275 and 43 look specific to one value of 2*GAMMA2 - confirm against params.h */ + const __m256i zero = _mm256_setzero_si256(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); /* ceil(a / 128) */ + f1 = _mm256_mulhi_epu16(f1, v); /* high 16 bits of the 16-bit product with 11275 */ + f1 = _mm256_mulhrs_epi16(f1, shift); /* rounded multiply by 128 / 2^15, i.e. rounded division by 256 */ + t = _mm256_sub_epi32(max, f1); + f1 = _mm256_blendv_epi32(f1, zero, t); /* lanes with a1 > 43 are set to 0 */ + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); /* a0 = a - a1 * alpha */ + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); /* subtract q from lanes where a0 exceeds q >> 1 */ + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +/************************************************* +* Name: make_hint +* +* Description: Compute indices of polynomial coefficients whose low bits +* overflow into the high bits. 
+* +* Arguments: - uint8_t *hint: hint array +* - const __m256i *a0: low bits of input elements +* - const __m256i *a1: high bits of input elements +* +* Returns number of overflowing low bits +**************************************************/ +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { + unsigned int i, n = 0; + __m256i f0, f1, g0, g1; + uint32_t bad; + uint64_t idx; + const __m256i low = _mm256_set1_epi32(-GAMMA2); + const __m256i high = _mm256_set1_epi32(GAMMA2); + + for (i = 0; i < N / 8; ++i) { + f0 = _mm256_load_si256(&a0[i]); + f1 = _mm256_load_si256(&a1[i]); + g0 = _mm256_abs_epi32(f0); + g0 = _mm256_cmpgt_epi32(g0, high); /* |a0| > GAMMA2 */ + g1 = _mm256_cmpeq_epi32(f0, low); + g1 = _mm256_sign_epi32(g1, f1); /* keeps the a0 == -GAMMA2 flag only where a1 is positive */ + g0 = _mm256_or_si256(g0, g1); + + bad = _mm256_movemask_ps(_mm256_castsi256_ps(g0)); /* one bit per flagged lane; the cast intrinsic replaces the non-portable (__m256) vector cast */ + memcpy(&idx, PQCLEAN_DILITHIUM2AES_AVX2_idxlut[bad], 8); + idx += (uint64_t)0x0808080808080808 * i; /* add 8*i to every byte: lane numbers become coefficient positions */ + memcpy(&hint[n], &idx, 8); /* always writes 8 bytes; only the first popcount(bad) are kept */ + n += _mm_popcnt_u32(bad); + } + + return n; +} + +/************************************************* +* Name: use_hint +* +* Description: Correct high parts according to hint. 
+* +* Arguments: - __m256i *b: output array of length N/8 with corrected high parts +* - const __m256i *a: input array of length N/8 +* - const __m256i *hint: input array of length N/8 with hint bits +* +**************************************************/ +void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { + unsigned int i; + __m256i a0[N / 8]; /* scratch for the low parts produced by decompose */ + __m256i f, g, h, t; + const __m256i zero = _mm256_setzero_si256(); + const __m256i max = _mm256_set1_epi32(43); + + PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(b, a0, a); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a0[i]); + g = _mm256_load_si256(&b[i]); + h = _mm256_load_si256(&hint[i]); + t = _mm256_blendv_epi32(zero, h, f); /* hint value where the low part is negative, else 0 */ + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); /* +hint for non-negative low parts, -hint for negative ones */ + g = _mm256_add_epi32(g, h); + g = _mm256_blendv_epi32(g, max, g); /* negative results wrap to 43 */ + f = _mm256_cmpgt_epi32(g, max); + g = _mm256_blendv_epi32(g, zero, f); /* results above 43 wrap to 0 */ + _mm256_store_si256(&b[i], g); + } +} diff --git a/crypto_sign/dilithium2aes/avx2/rounding.h b/crypto_sign/dilithium2aes/avx2/rounding.h new file mode 100644 index 00000000..f8e790fd --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/rounding.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H +#define PQCLEAN_DILITHIUM2AES_AVX2_ROUNDING_H +#include "params.h" +#include +#include + +void PQCLEAN_DILITHIUM2AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); +void PQCLEAN_DILITHIUM2AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); +unsigned int PQCLEAN_DILITHIUM2AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); +void PQCLEAN_DILITHIUM2AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/shuffle.S b/crypto_sign/dilithium2aes/avx2/shuffle.S new file mode 100644 index 00000000..047deaa7 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/shuffle.S @@ -0,0 +1,54 @@ +#include "cdecl.h" +.include 
"shuffle.inc" + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM2AES_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium2aes/avx2/shuffle.inc b/crypto_sign/dilithium2aes/avx2/shuffle.inc new file mode 100644 index 00000000..73e9ffe0 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/shuffle.inc @@ -0,0 +1,25 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium2aes/avx2/sign.c 
b/crypto_sign/dilithium2aes/avx2/sign.c new file mode 100644 index 00000000..a6b111a6 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/sign.c @@ -0,0 +1,425 @@ +#include "aes256ctr.h" +#include "align.h" +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include +#include + + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. +* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + const uint8_t *rho, *rhoprime, *key; + uint64_t nonce; + aes256ctr_ctx aesctx; + polyvecl rowbuf[1]; /* a single matrix row is expanded at a time */ + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); /* expands the seed in place to rho || rhoprime || key */ + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + for (i = 0; i < L; ++i) { + nonce = i; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); /* only the nonce field of the initialised context is replaced */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s1.vec[i], &aesctx); + } + for (i = 0; i < K; ++i) { + nonce = L + i; /* s2 nonces continue after the L nonces used for s1 */ + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_eta_preinit(&s2.vec[i], 
&aesctx); + } + + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM2AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); + } + + /* Transform s1 */ + PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1); + + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rho, 0); /* re-key the stream with rho for the matrix expansion */ + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + for (unsigned int j = 0; j < L; j++) { + nonce = (i << 8) + j; /* row index above bit 8, column index below */ + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); + PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[j]); + } + + /* Compute inner-product */ + PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM2AES_AVX2_poly_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM2AES_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM2AES_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); /* t0 follows the L + K packed eta polynomials in sk */ + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES); /* hashes the complete packed public key */ + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature +* +* Description: Computes signature. 
+* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; /* layout: rho | tr | key | mu | rhoprime */ + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; /* hint section follows the challenge seed and the packed z */ + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; /* y and w0 share storage */ + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM2AES_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* hashes key || mu, which are adjacent in seedbuf */ + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_ntt(&t0); + + aes256ctr_ctx aesctx; + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + +rej: + /* Sample intermediate vector y */ + for (i = 0; i < L; ++i) { + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce++; /* never reset, so each pass through rej uses nonces not used before */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_gamma1_preinit(&z.vec[i], &aesctx); + } + + /* Matrix-vector product */ + tmpv.y = z; + 
PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM2AES_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); /* w0 overwrites y inside the union */ + PQCLEAN_DILITHIUM2AES_AVX2_polyveck_pack_w1(sig, &w1); /* sig doubles as scratch space for the packed w1 */ + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); /* the challenge seed overwrites the start of sig */ + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c); + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&z.vec[i]); + if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { + goto rej; /* restart with a fresh y */ + } + } + + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&tmp); + if 
(PQCLEAN_DILITHIUM2AES_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM2AES_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM2AES_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { /* more hint positions than the signature has room for */ + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; /* running end offset of row i within the hint list */ + } + + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2AES_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); + } + + *siglen = PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { /* copy back to front so the move is safe when sm == m */ + sm[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, mlen, sk); /* signs the relocated message copy */ + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify +* +* Description: Verifies signature. 
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { + unsigned int i, j, pos = 0; + /* PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack writes additional 14 bytes */ + ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; + uint8_t mu[CRHBYTES]; + const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; /* hint section follows the challenge seed and the packed z */ + uint64_t nonce; + aes256ctr_ctx aesctx; + polyvecl rowbuf[1]; /* a single matrix row is expanded at a time */ + polyvecl *row = rowbuf; + polyvecl z; + poly c, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); /* mu is reused: hash of pk in, message hash out */ + shake256_inc_ctx_release(&state); + + /* Expand the challenge polynomial from the seed at the start of sig */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&c); + + /* Unpack z; shortness follows from unpacking */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM2AES_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&z.vec[i]); + } + + PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(&aesctx, pk, 0); /* keyed from the start of pk, where keypair stores rho */ + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + for (j = 0; j < L; j++) { + nonce = (i << 8) + j; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM2AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); + 
PQCLEAN_DILITHIUM2AES_AVX2_poly_nttunpack(&row->vec[j]); + } + + /* Compute i-th row of Az - c2^Dt1 */ + PQCLEAN_DILITHIUM2AES_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); + + PQCLEAN_DILITHIUM2AES_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); /* h temporarily holds t1 for this row */ + PQCLEAN_DILITHIUM2AES_AVX2_poly_shiftl(&h); + PQCLEAN_DILITHIUM2AES_AVX2_poly_ntt(&h); + PQCLEAN_DILITHIUM2AES_AVX2_poly_pointwise_montgomery(&h, &c, &h); + + PQCLEAN_DILITHIUM2AES_AVX2_poly_sub(&w1, &w1, &h); + PQCLEAN_DILITHIUM2AES_AVX2_poly_reduce(&w1); + PQCLEAN_DILITHIUM2AES_AVX2_poly_invntt_tomont(&w1); + + /* Get hint polynomial and reconstruct w1 */ + memset(h.vec, 0, sizeof(poly)); + if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { /* row end offsets must not decrease and must not exceed OMEGA */ + return -1; + } + + for (j = pos; j < hint[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > pos && hint[j] <= hint[j - 1]) { + return -1; + } + h.coeffs[hint[j]] = 1; + } + pos = hint[OMEGA + i]; + + PQCLEAN_DILITHIUM2AES_AVX2_poly_caddq(&w1); + PQCLEAN_DILITHIUM2AES_AVX2_poly_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM2AES_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); + } + + /* Extra indices are zero for strong unforgeability */ + for (j = pos; j < OMEGA; ++j) { + if (hint[j]) { + return -1; + } + } + + /* Recompute the challenge seed from mu and the packed w1, then compare it with the one in sig */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); /* buf is reused for the recomputed seed */ + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (buf.coeffs[i] != sig[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES) { /* too short to contain a signature */ + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { /* forward copy, safe when m == sm because the source lies ahead of the destination */ + m[i] = sm[PQCLEAN_DILITHIUM2AES_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification failed */ + *mlen = (size_t) -1; /* explicit conversion: all-ones length marks failure without a signed-to-unsigned warning */ + for (i = 0; i < smlen; ++i) { /* clear all smlen output bytes so no unverified data is returned */ + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium2aes/avx2/sign.h b/crypto_sign/dilithium2aes/avx2/sign.h new file mode 100644 index 00000000..3c372ea2 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H +#define PQCLEAN_DILITHIUM2AES_AVX2_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM2AES_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const 
uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM2AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium2aes/avx2/symmetric.h b/crypto_sign/dilithium2aes/avx2/symmetric.h new file mode 100644 index 00000000..681d9ad9 --- /dev/null +++ b/crypto_sign/dilithium2aes/avx2/symmetric.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM2AES_AVX2_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) +#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM2AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium2aes/clean/LICENSE b/crypto_sign/dilithium2aes/clean/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium2aes/clean/Makefile b/crypto_sign/dilithium2aes/clean/Makefile new file mode 100644 index 00000000..c5955825 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium2aes_clean.a +HEADERS=aes256ctr.h api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=aes256ctr.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-aes.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium2aes/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium2aes/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..b3f34a7e --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/Makefile.Microsoft_nmake @@ -0,0 +1,23 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libdilithium2aes_clean.lib +OBJECTS=aes256ctr.obj ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-aes.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_sign/dilithium2aes/clean/aes256ctr.c b/crypto_sign/dilithium2aes/clean/aes256ctr.c new file mode 100644 index 00000000..8ccd25a1 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+
+/* Decode four bytes at src as one little-endian 32-bit word. */
+static inline uint32_t br_dec32le(const uint8_t *src) {
+    return (uint32_t)src[0]
+           | ((uint32_t)src[1] << 8)
+           | ((uint32_t)src[2] << 16)
+           | ((uint32_t)src[3] << 24);
+}
+
+/* Decode num consecutive little-endian 32-bit words from src into v. */
+static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) {
+    while (num-- > 0) {
+        *v ++ = br_dec32le(src);
+        src += 4;
+    }
+}
+
+/* Reverse the byte order of a 32-bit word. */
+static inline uint32_t br_swap32(uint32_t x) {
+    x = ((x & (uint32_t)0x00FF00FF) << 8)
+        | ((x >> 8) & (uint32_t)0x00FF00FF);
+    return (x << 16) | (x >> 16);
+}
+
+/* Store x at dst as four little-endian bytes. */
+static inline void br_enc32le(uint8_t *dst, uint32_t x) {
+    dst[0] = (uint8_t)x;
+    dst[1] = (uint8_t)(x >> 8);
+    dst[2] = (uint8_t)(x >> 16);
+    dst[3] = (uint8_t)(x >> 24);
+}
+
+/* Store num 32-bit words from v at dst, each as four little-endian bytes. */
+static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) {
+    while (num-- > 0) {
+        br_enc32le(dst, *v ++);
+        dst += 4;
+    }
+}
+
+/* Apply the S-box circuit in place to the eight bitsliced words q[0..7]. */
+static void br_aes_ct64_bitslice_Sbox(uint64_t *q) {
+    /*
+     * This S-box implementation is a straightforward translation of
+     * the circuit described by Boyar and Peralta in "A new
+     * combinational logic minimization technique with applications
+     * to cryptology" (https://eprint.iacr.org/2009/191.pdf).
+     *
+     * Note that variables x* (input) and s* (output) are numbered
+     * in "reverse" order (x0 is the high bit, x7 is the low bit).
+     */
+
+    uint64_t x0, x1, x2, x3, x4, x5, x6, x7;
+    uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9;
+    uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19;
+    uint64_t y20, y21;
+    uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9;
+    uint64_t z10, z11, z12, z13, z14, z15, z16, z17;
+    uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
+    uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19;
+    uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
+    uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39;
+    uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49;
+    uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59;
+    uint64_t t60, t61, t62, t63, t64, t65, t66, t67;
+    uint64_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+    x0 = q[7];
+    x1 = q[6];
+    x2 = q[5];
+    x3 = q[4];
+    x4 = q[3];
+    x5 = q[2];
+    x6 = q[1];
+    x7 = q[0];
+
+    /*
+     * Top linear transformation.
+     */
+    y14 = x3 ^ x5;
+    y13 = x0 ^ x6;
+    y9 = x0 ^ x3;
+    y8 = x0 ^ x5;
+    t0 = x1 ^ x2;
+    y1 = t0 ^ x7;
+    y4 = y1 ^ x3;
+    y12 = y13 ^ y14;
+    y2 = y1 ^ x0;
+    y5 = y1 ^ x6;
+    y3 = y5 ^ y8;
+    t1 = x4 ^ y12;
+    y15 = t1 ^ x5;
+    y20 = t1 ^ x1;
+    y6 = y15 ^ x7;
+    y10 = y15 ^ t0;
+    y11 = y20 ^ y9;
+    y7 = x7 ^ y11;
+    y17 = y10 ^ y11;
+    y19 = y10 ^ y8;
+    y16 = t0 ^ y11;
+    y21 = y13 ^ y16;
+    y18 = x0 ^ y16;
+
+    /*
+     * Non-linear section.
+     */
+    t2 = y12 & y15;
+    t3 = y3 & y6;
+    t4 = t3 ^ t2;
+    t5 = y4 & x7;
+    t6 = t5 ^ t2;
+    t7 = y13 & y16;
+    t8 = y5 & y1;
+    t9 = t8 ^ t7;
+    t10 = y2 & y7;
+    t11 = t10 ^ t7;
+    t12 = y9 & y11;
+    t13 = y14 & y17;
+    t14 = t13 ^ t12;
+    t15 = y8 & y10;
+    t16 = t15 ^ t12;
+    t17 = t4 ^ t14;
+    t18 = t6 ^ t16;
+    t19 = t9 ^ t14;
+    t20 = t11 ^ t16;
+    t21 = t17 ^ y20;
+    t22 = t18 ^ y19;
+    t23 = t19 ^ y21;
+    t24 = t20 ^ y18;
+
+    t25 = t21 ^ t22;
+    t26 = t21 & t23;
+    t27 = t24 ^ t26;
+    t28 = t25 & t27;
+    t29 = t28 ^ t22;
+    t30 = t23 ^ t24;
+    t31 = t22 ^ t26;
+    t32 = t31 & t30;
+    t33 = t32 ^ t24;
+    t34 = t23 ^ t33;
+    t35 = t27 ^ t33;
+    t36 = t24 & t35;
+    t37 = t36 ^ t34;
+    t38 = t27 ^ t36;
+    t39 = t29 & t38;
+    t40 = t25 ^ t39;
+
+    t41 = t40 ^ t37;
+    t42 = t29 ^ t33;
+    t43 = t29 ^ t40;
+    t44 = t33 ^ t37;
+    t45 = t42 ^ t41;
+    z0 = t44 & y15;
+    z1 = t37 & y6;
+    z2 = t33 & x7;
+    z3 = t43 & y16;
+    z4 = t40 & y1;
+    z5 = t29 & y7;
+    z6 = t42 & y11;
+    z7 = t45 & y17;
+    z8 = t41 & y10;
+    z9 = t44 & y12;
+    z10 = t37 & y3;
+    z11 = t33 & y4;
+    z12 = t43 & y13;
+    z13 = t40 & y5;
+    z14 = t29 & y2;
+    z15 = t42 & y9;
+    z16 = t45 & y14;
+    z17 = t41 & y8;
+
+    /*
+     * Bottom linear transformation.
+     */
+    t46 = z15 ^ z16;
+    t47 = z10 ^ z11;
+    t48 = z5 ^ z13;
+    t49 = z9 ^ z10;
+    t50 = z2 ^ z12;
+    t51 = z2 ^ z5;
+    t52 = z7 ^ z8;
+    t53 = z0 ^ z3;
+    t54 = z6 ^ z7;
+    t55 = z16 ^ z17;
+    t56 = z12 ^ t48;
+    t57 = t50 ^ t53;
+    t58 = z4 ^ t46;
+    t59 = z3 ^ t54;
+    t60 = t46 ^ t57;
+    t61 = z14 ^ t57;
+    t62 = t52 ^ t58;
+    t63 = t49 ^ t58;
+    t64 = z4 ^ t59;
+    t65 = t61 ^ t62;
+    t66 = z1 ^ t63;
+    s0 = t59 ^ t63;
+    s6 = t56 ^ ~t62;
+    s7 = t48 ^ ~t60;
+    t67 = t64 ^ t65;
+    s3 = t53 ^ t66;
+    s4 = t51 ^ t66;
+    s5 = t47 ^ t65;
+    s1 = t64 ^ ~s3;
+    s2 = t55 ^ ~t67;
+
+    q[7] = s0;
+    q[6] = s1;
+    q[5] = s2;
+    q[4] = s3;
+    q[3] = s4;
+    q[2] = s5;
+    q[1] = s6;
+    q[0] = s7;
+}
+
+/* Run three layers of masked swaps (1-, 2- and 4-bit groups) between pairs
+ * of the eight words in q, in place.  Callers in this file apply it both
+ * before and after the S-box / round functions. */
+static void br_aes_ct64_ortho(uint64_t *q) {
+#define SWAPN(cl, ch, s, x, y)   do { \
+        uint64_t a, b; \
+        a = (x); \
+        b = (y); \
+        (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \
+        (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \
+    } while (0)
+
+#define SWAP2(x, y)    SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA,  1, x, y)
+#define SWAP4(x, y)    SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC,  2, x, y)
+#define SWAP8(x, y)    SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0,  4, x, y)
+
+    SWAP2(q[0], q[1]);
+    SWAP2(q[2], q[3]);
+    SWAP2(q[4], q[5]);
+    SWAP2(q[6], q[7]);
+
+    SWAP4(q[0], q[2]);
+    SWAP4(q[1], q[3]);
+    SWAP4(q[4], q[6]);
+    SWAP4(q[5], q[7]);
+
+    SWAP8(q[0], q[4]);
+    SWAP8(q[1], q[5]);
+    SWAP8(q[2], q[6]);
+    SWAP8(q[3], q[7]);
+}
+
+/* Spread the bytes of the four 32-bit words w[0..3] into two 64-bit words:
+ * *q0 receives w[0] and w[2], *q1 receives w[1] and w[3]. */
+static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) {
+    uint64_t x0, x1, x2, x3;
+
+    x0 = w[0];
+    x1 = w[1];
+    x2 = w[2];
+    x3 = w[3];
+    x0 |= (x0 << 16);
+    x1 |= (x1 << 16);
+    x2 |= (x2 << 16);
+    x3 |= (x3 << 16);
+    x0 &= (uint64_t)0x0000FFFF0000FFFF;
+    x1 &= (uint64_t)0x0000FFFF0000FFFF;
+    x2 &= (uint64_t)0x0000FFFF0000FFFF;
+    x3 &= (uint64_t)0x0000FFFF0000FFFF;
+    x0 |= (x0 << 8);
+    x1 |= (x1 << 8);
+    x2 |= (x2 << 8);
+    x3 |= (x3 << 8);
+    x0 &= (uint64_t)0x00FF00FF00FF00FF;
+    x1 &= (uint64_t)0x00FF00FF00FF00FF;
+    x2 &= (uint64_t)0x00FF00FF00FF00FF;
+    x3 &= (uint64_t)0x00FF00FF00FF00FF;
+    *q0 = x0 | (x2 << 8);
+    *q1 = x1 | (x3 << 8);
+}
+
+/* Inverse of br_aes_ct64_interleave_in: gather four 32-bit words w[0..3]
+ * back out of the two 64-bit words q0 and q1. */
+static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
+    uint64_t x0, x1, x2, x3;
+
+    x0 = q0 & (uint64_t)0x00FF00FF00FF00FF;
+    x1 = q1 & (uint64_t)0x00FF00FF00FF00FF;
+    x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+    x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF;
+    x0 |= (x0 >> 8);
+    x1 |= (x1 >> 8);
+    x2 |= (x2 >> 8);
+    x3 |= (x3 >> 8);
+    x0 &= (uint64_t)0x0000FFFF0000FFFF;
+    x1 &= (uint64_t)0x0000FFFF0000FFFF;
+    x2 &= (uint64_t)0x0000FFFF0000FFFF;
+    x3 &= (uint64_t)0x0000FFFF0000FFFF;
+    w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16);
+    w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16);
+    w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16);
+    w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16);
+}
+
+/* Round constants XORed into the key schedule, one per completed group of
+ * nk words. */
+static const uint8_t Rcon[] = {
+    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36
+};
+
+/* Run one 32-bit word through the bitsliced S-box (ortho, S-box, ortho)
+ * and return the low 32 bits of the result. */
+static uint32_t sub_word(uint32_t x) {
+    uint64_t q[8];
+
+    memset(q, 0, sizeof q);
+    q[0] = x;
+    br_aes_ct64_ortho(q);
+    br_aes_ct64_bitslice_Sbox(q);
+    br_aes_ct64_ortho(q);
+    return (uint32_t)q[0];
+}
+
+/* Key schedule for a fixed 32-byte key: expands key into 60 round-key words
+ * and stores them in compressed bitsliced form, two 64-bit words per round
+ * key (30 words written to comp_skey). */
+static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) {
+    int i, j, k, nk, nkf;
+    uint32_t tmp;
+    uint32_t skey[60];
+
+    int key_len = 32;
+
+    nk = (int)(key_len >> 2);
+    nkf = (int)((14 + 1) << 2);
+    br_range_dec32le(skey, (key_len >> 2), key);
+    tmp = skey[(key_len >> 2) - 1];
+    for (i = nk, j = 0, k = 0; i < nkf; i ++) {
+        if (j == 0) {
+            tmp = (tmp << 24) | (tmp >> 8);
+            tmp = sub_word(tmp) ^ Rcon[k];
+        } else if (nk > 6 && j == 4) {
+            tmp = sub_word(tmp);
+        }
+        tmp ^= skey[i - nk];
+        skey[i] = tmp;
+        if (++ j == nk) {
+            j = 0;
+            k ++;
+        }
+    }
+
+    for (i = 0, j = 0; i < nkf; i += 4, j += 2) {
+        uint64_t q[8];
+
+        br_aes_ct64_interleave_in(&q[0], &q[4], skey + i);
+        q[1] = q[0];
+        q[2] = q[0];
+        q[3] = q[0];
+        q[5] = q[4];
+        q[6] = q[4];
+        q[7] = q[4];
+        br_aes_ct64_ortho(q);
+        comp_skey[j + 0] =
+            (q[0] & (uint64_t)0x1111111111111111)
+            | (q[1] & (uint64_t)0x2222222222222222)
+            | (q[2] & (uint64_t)0x4444444444444444)
+            | (q[3] & (uint64_t)0x8888888888888888);
+        comp_skey[j + 1] =
+            (q[4] & (uint64_t)0x1111111111111111)
+            | (q[5] & (uint64_t)0x2222222222222222)
+            | (q[6] & (uint64_t)0x4444444444444444)
+            | (q[7] & (uint64_t)0x8888888888888888);
+    }
+}
+
+/* Expand the 30 compressed key words into 120 words: each compressed word
+ * yields four words, one per bit position modulo 4, with every selected bit
+ * replicated across its 4-bit group ((x << 4) - x multiplies by 15). */
+static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) {
+    unsigned u, v, n;
+
+    n = (14 + 1) << 1;
+    for (u = 0, v = 0; u < n; u ++, v += 4) {
+        uint64_t x0, x1, x2, x3;
+
+        x0 = x1 = x2 = x3 = comp_skey[u];
+        x0 &= (uint64_t)0x1111111111111111;
+        x1 &= (uint64_t)0x2222222222222222;
+        x2 &= (uint64_t)0x4444444444444444;
+        x3 &= (uint64_t)0x8888888888888888;
+        x1 >>= 1;
+        x2 >>= 2;
+        x3 >>= 3;
+        skey[v + 0] = (x0 << 4) - x0;
+        skey[v + 1] = (x1 << 4) - x1;
+        skey[v + 2] = (x2 << 4) - x2;
+        skey[v + 3] = (x3 << 4) - x3;
+    }
+}
+
+/* XOR the eight round-key words sk[0..7] into the state q[0..7]. */
+static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
+    q[0] ^= sk[0];
+    q[1] ^= sk[1];
+    q[2] ^= sk[2];
+    q[3] ^= sk[3];
+    q[4] ^= sk[4];
+    q[5] ^= sk[5];
+    q[6] ^= sk[6];
+    q[7] ^= sk[7];
+}
+
+/* Apply the same fixed bit permutation to each of the eight state words:
+ * the low 16 bits stay put and the three higher 16-bit groups are rotated
+ * by 4, 8 and 12 bits respectively. */
+static inline void shift_rows(uint64_t *q) {
+    int i;
+
+    for (i = 0; i < 8; i ++) {
+        uint64_t x;
+
+        x = q[i];
+        q[i] = (x & (uint64_t)0x000000000000FFFF)
+               | ((x & (uint64_t)0x00000000FFF00000) >> 4)
+               | ((x & (uint64_t)0x00000000000F0000) << 12)
+               | ((x & (uint64_t)0x0000FF0000000000) >> 8)
+               | ((x & (uint64_t)0x000000FF00000000) << 8)
+               | ((x & (uint64_t)0xF000000000000000) >> 12)
+               | ((x & (uint64_t)0x0FFF000000000000) << 4);
+    }
+}
+
+/* Rotate a 64-bit word by 32 bits (swap its two halves). */
+static inline uint64_t rotr32(uint64_t x) {
+    return (x << 32) | (x >> 32);
+}
+
+/* Column-mixing step on the bitsliced state: each output word is an XOR of
+ * input words and their 16- and 32-bit rotations. */
+static inline void mix_columns(uint64_t *q) {
+    uint64_t q0, q1, q2, q3, q4, q5, q6, q7;
+    uint64_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+    q0 = q[0];
+    q1 = q[1];
+    q2 = q[2];
+    q3 = q[3];
+    q4 = q[4];
+    q5 = q[5];
+    q6 = q[6];
+    q7 = q[7];
+    r0 = (q0 >> 16) | (q0 << 48);
+    r1 = (q1 >> 16) | (q1 << 48);
+    r2 = (q2 >> 16) | (q2 << 48);
+    r3 = (q3 >> 16) | (q3 << 48);
+    r4 = (q4 >> 16) | (q4 << 48);
+    r5 = (q5 >> 16) | (q5 << 48);
+    r6 = (q6 >> 16) | (q6 << 48);
+    r7 = (q7 >> 16) | (q7 << 48);
+
+    q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0);
+    q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1);
+    q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2);
+    q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3);
+    q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4);
+    q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5);
+    q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6);
+    q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7);
+}
+
+/* Add 4 to a counter word that is kept byte-swapped in *x. */
+static void inc4_be(uint32_t *x) {
+    *x = br_swap32(*x) + 4;
+    *x = br_swap32(*x);
+}
+
+/*
+ * Encrypt the four 16-byte counter blocks held in ivw and write the 64
+ * resulting bytes to out, then advance each of the four counters by 4.
+ *
+ * sk_exp is the expanded key produced by br_aes_ct64_ctr_init: 15 round
+ * keys of 8 words each.  The bound is declared as 120 because the final
+ * round reads sk_exp[112..119].
+ */
+static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], const uint64_t sk_exp[120]) {
+    uint32_t w[16];
+    uint64_t q[8];
+    int i;
+
+    memcpy(w, ivw, sizeof(w));
+    for (i = 0; i < 4; i++) {
+        br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2));
+    }
+    br_aes_ct64_ortho(q);
+
+    add_round_key(q, sk_exp);
+    for (i = 1; i < 14; i++) {
+        br_aes_ct64_bitslice_Sbox(q);
+        shift_rows(q);
+        mix_columns(q);
+        add_round_key(q, sk_exp + (i << 3));
+    }
+    /* Last round has no mix_columns; 112 == 14 << 3. */
+    br_aes_ct64_bitslice_Sbox(q);
+    shift_rows(q);
+    add_round_key(q, sk_exp + 112);
+
+    br_aes_ct64_ortho(q);
+    for (i = 0; i < 4; i ++) {
+        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
+    }
+    br_range_enc32le(out, w, 16);
+
+    /* Increase counter for next 4 blocks */
+    inc4_be(ivw + 3);
+    inc4_be(ivw + 7);
+    inc4_be(ivw + 11);
+    inc4_be(ivw + 15);
+}
+
+/* Derive the 120-word expanded key sk_exp from a 32-byte key. */
+static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) {
+    uint64_t skey[30];
+
+    br_aes_ct64_keysched(skey, key);
+    br_aes_ct64_skey_expand(sk_exp, skey);
+}
+
+/* Write len keystream bytes to data.  The four counter blocks are built
+ * from the 12-byte iv followed by the counters cc .. cc + 3. */
+static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) {
+    uint32_t ivw[16];
+    size_t i;
+
+    br_range_dec32le(ivw, 3, iv);
+    memcpy(ivw +  4, ivw, 3 * sizeof(uint32_t));
+    memcpy(ivw +  8, ivw, 3 * sizeof(uint32_t));
+    memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
+    ivw[ 3] = br_swap32(cc);
+    ivw[ 7] = br_swap32(cc + 1);
+    ivw[11] = br_swap32(cc + 2);
+    ivw[15] = br_swap32(cc + 3);
+
+    while (len > 64) {
+        aes_ctr4x(data, ivw, sk_exp);
+        data += 64;
+        len -= 64;
+    }
+    /* The final 1..64 bytes go through a temporary so that no more than
+     * len bytes are ever written to data. */
+    if (len > 0) {
+        uint8_t tmp[64];
+        aes_ctr4x(tmp, ivw, sk_exp);
+        for (i = 0; i < len; i++) {
+            data[i] = tmp[i];
+        }
+    }
+}
+
+/* One-shot interface: write outlen keystream bytes to out for the given
+ * 32-byte key and 12-byte nonce, starting at counter 0. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) {
+    uint64_t sk_exp[120];
+
+    br_aes_ct64_ctr_init(sk_exp, key);
+    br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen);
+}
+
+/* Initialise a streaming context: expand the key and set up the four
+ * counter blocks (nonce followed by counters 0..3). */
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) {
+    br_aes_ct64_ctr_init(s->sk_exp, key);
+
+    br_range_dec32le(s->ivw, 3, nonce);
+    memcpy(s->ivw +  4, s->ivw, 3 * sizeof(uint32_t));
+    memcpy(s->ivw +  8, s->ivw, 3 * sizeof(uint32_t));
+    memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t));
+    s->ivw[ 3] = br_swap32(0);
+    s->ivw[ 7] = br_swap32(1);
+    s->ivw[11] = br_swap32(2);
+    s->ivw[15] = br_swap32(3);
+}
+
+/* Write nblocks blocks of 64 bytes each to out, advancing the counters
+ * stored in the context. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) {
+    while (nblocks > 0) {
+        aes_ctr4x(out, s->ivw, s->sk_exp);
+        out += 64;
+        nblocks--;
+    }
+}
diff --git a/crypto_sign/dilithium2aes/clean/aes256ctr.h b/crypto_sign/dilithium2aes/clean/aes256ctr.h
new file mode 100644
index 00000000..094a18e1
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/aes256ctr.h
@@ -0,0 +1,28 @@
+#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_AES256CTR_H
+#define PQCLEAN_DILITHIUM2AES_CLEAN_AES256CTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define AES256CTR_BLOCKBYTES 64
+
+
+typedef struct {
+    uint64_t sk_exp[120];
+    uint32_t ivw[16];
+} aes256ctr_ctx;
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_prf(uint8_t *out,
+        size_t outlen,
+        const uint8_t key[32],
+        const uint8_t nonce[12]);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_init(aes256ctr_ctx *state,
+        const uint8_t key[32],
+        const uint8_t nonce[12]);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out,
+        size_t nblocks,
+        aes256ctr_ctx *state);
+
+#endif
diff --git a/crypto_sign/dilithium2aes/clean/api.h b/crypto_sign/dilithium2aes/clean/api.h
new file mode 100644
index 00000000..6f19a259
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/api.h
@@ -0,0 +1,31 @@
+#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_API_H
+#define PQCLEAN_DILITHIUM2AES_CLEAN_API_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES 1312
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES 2544
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES 2420
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_ALGNAME "Dilithium2-AES"
+
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_signature(
+    uint8_t *sig, size_t *siglen,
+    const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_verify(
+    const uint8_t *sig, size_t siglen,
+    const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign(
+    uint8_t *sm, size_t *smlen,
+    const uint8_t *m, size_t mlen, const uint8_t *sk);
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_open(
+    uint8_t *m, size_t *mlen,
+    const uint8_t *sm, size_t smlen, const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/dilithium2aes/clean/ntt.c b/crypto_sign/dilithium2aes/clean/ntt.c
new file mode 100644
index 00000000..9dfe2088
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/ntt.c
@@ -0,0 +1,98 @@
+#include "ntt.h"
+#include "params.h"
+#include "reduce.h"
+#include <stdint.h>
+
+static const int32_t zetas[N] = {
+    0,    25847, -2608894,  -518909,   237124,  -777960,  -876248,   466468,
+    1826347,  2353451,  -359251, -2091905,  3119733, -2884855,  3111497,  2680103,
+    2725464,  1024112, -1079900,  3585928,  -549488, -1119584,  2619752, -2108549,
+    -2118186, -3859737, -1399561, -3277672,  1757237,   -19422,  4010497,   280005,
+    2706023,    95776,  3077325,  3530437, -1661693, -3592148, -2537516,  3915439,
+    -3861115, -3043716,  3574422, -2867647,  3539968,  -300467,  2348700,  -539299,
+    -1699267, -1643818,  3505694, -3821735,  3507263, -2140649, -1600420,  3699596,
+    811944,   531354,   954230,  3881043,  3900724, -2556880,  2071892, -2797779,
+    -3930395, -1528703, -3677745, -3041255, -1452451,  3475950,  2176455, -1585221,
+    -1257611,  1939314, -4083598, -1000202, -3190144, -3157330, -3632928,   126922,
+    3412210,  -983419,  2147896,  2715295, -2967645, -3693493,  -411027, -2477047,
+    -671102, -1228525,   -22981, -1308169,  -381987,  1349076,  1852771, -1430430,
+    -3343383,   264944,   508951,  3097992,    44288, -1100098,   904516,  3958618,
+    -3724342,    -8578,  1653064, -3249728,  2389356,  -210977,   759969, -1316856,
+    189548, -3553272,  3159746, -1851402, -2409325,  -177440,  1315589,  1341330,
+    1285669, -1584928,  -812732, -1439742, -3019102, -3881060, -3628969,  3839961,
+    2091667,  3407706,  2316500,  3817976, -3342478,  2244091, -2446433, -3562462,
+    266997,  2434439, -1235728,  3513181, -3520352, -3759364, -1197226, -3193378,
+    900702,  1859098,   909542,   819034,   495491, -1613174,   -43260,  -522500,
+    -655327, -3122442,  2031748,  3207046, -3556995,  -525098,  -768622, -3595838,
+    342297,   286988, -2437823,  4108315,  3437287, -3342277,  1735879,   203044,
+    2842341,  2691481, -2590150,  1265009,  4055324,  1247620,  2486353,  1595974,
+    -3767016,  1250494,  2635921, -3548272, -2994039,  1869119,  1903435, -1050970,
+    -1333058,  1237275, -3318210, -1430225,  -451100,  1312455,  3306115, -1962642,
+    -1279661,  1917081, -2546312, -1374803,  1500165,   777191,  2235880,  3406031,
+    -542412, -2831860, -1671176, -1846953, -2584293, -3724270,   594136, -3776993,
+    -2013608,  2432395,  2454455,  -164721,  1957272,  3369112,   185531, -1207385,
+    -3183426,   162844,  1616392,  3014001,   810149,  1652634, -3694233, -1799107,
+    -3038916,  3523897,  3866901,   269760,  2213111,  -975884,  1717735,   472078,
+    -426683,  1723600, -1803090,  1910376, -1667432, -1104333,  -260646, -3833893,
+    -2939036, -2235985,  -420899, -2286327,   183443,  -976891,  1612842, -3545687,
+    -554416,  3919660,   -48306, -1362209,  3937738,  1400424,  -846154,  1976782
+};
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_ntt
+*
+* Description: Forward NTT, in-place. No modular reduction is performed after
+*              additions or subtractions. Output vector is in bitreversed order.
+*              Each butterfly multiplies the upper element by the next entry
+*              of the zetas table (entry 0 is never used) through
+*              montgomery_reduce, then stores difference and sum.
+*
+* Arguments:   - int32_t a[N]: input/output coefficient array
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_ntt(int32_t a[N]) {
+    unsigned int span, base, idx;
+    unsigned int zi = 0;
+
+    /* Butterfly distance halves every layer: 128, 64, ..., 1. */
+    for (span = 128; span != 0; span /= 2) {
+        /* Each group covers 2 * span consecutive coefficients. */
+        for (base = 0; base < N; base += 2 * span) {
+            const int32_t zeta = zetas[++zi];
+
+            for (idx = base; idx < base + span; ++idx) {
+                const int32_t lo = a[idx];
+                const int32_t prod = PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce((int64_t)zeta * a[idx + span]);
+
+                a[idx + span] = lo - prod;
+                a[idx] = lo + prod;
+            }
+        }
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_invntt_tomont
+*
+* Description: Inverse NTT and multiplication by Montgomery factor 2^32.
+*              In-place. No modular reductions after additions or
+*              subtractions; input coefficients need to be smaller than
+*              Q in absolute value. Output coefficients are smaller than Q in
+*              absolute value.
+*
+* Arguments:   - int32_t a[N]: input/output coefficient array
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_invntt_tomont(int32_t a[N]) {
+    unsigned int start, len, j, k;
+    int32_t t, zeta;
+    const int32_t f = 41978; // mont^2/256
+
+    /* The zetas table is walked backwards from its last entry, with each
+     * value negated. */
+    k = 256;
+    for (len = 1; len < N; len <<= 1) {
+        /* j == start + len when the inner loop exits, so this steps to the
+         * next group of 2 * len coefficients. */
+        for (start = 0; start < N; start = j + len) {
+            zeta = -zetas[--k];
+            for (j = start; j < start + len; ++j) {
+                t = a[j];
+                a[j] = t + a[j + len];
+                a[j + len] = t - a[j + len];
+                a[j + len] = PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]);
+            }
+        }
+    }
+
+    /* Final scaling of every coefficient by the constant f. */
+    for (j = 0; j < N; ++j) {
+        a[j] = PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce((int64_t)f * a[j]);
+    }
+}
diff --git a/crypto_sign/dilithium2aes/clean/ntt.h b/crypto_sign/dilithium2aes/clean/ntt.h
new file mode 100644
index 00000000..3de56c05
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/ntt.h
@@ -0,0 +1,10 @@
+#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_NTT_H
+#define PQCLEAN_DILITHIUM2AES_CLEAN_NTT_H
+#include "params.h"
+#include <stdint.h>
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_ntt(int32_t a[N]);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_invntt_tomont(int32_t a[N]);
+
+#endif
diff --git a/crypto_sign/dilithium2aes/clean/packing.c b/crypto_sign/dilithium2aes/clean/packing.c
new file mode 100644
index 00000000..bbaaf21f
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/packing.c
@@ -0,0 +1,261 @@
+#include "packing.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_pack_pk
+*
+* Description: Bit-pack public key pk = (rho, t1).
+*
+* Arguments:   - uint8_t pk[]: output byte array
+*              - const uint8_t rho[]: byte array containing rho
+*              - const polyveck *t1: pointer to vector t1
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES],
+        const uint8_t rho[SEEDBYTES],
+        const polyveck *t1) {
+    /* The packed t1 polynomials follow directly after the SEEDBYTES of rho. */
+    uint8_t *const t1_out = pk + SEEDBYTES;
+    unsigned int n;
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        pk[n] = rho[n];
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_pack(t1_out + n * POLYT1_PACKEDBYTES, &t1->vec[n]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_unpack_pk
+*
+* Description: Unpack public key pk = (rho, t1).
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho
+*              - polyveck *t1: pointer to output vector t1
+*              - const uint8_t pk[]: byte array containing bit-packed pk
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES],
+        polyveck *t1,
+        const uint8_t pk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES]) {
+    /* The packed t1 polynomials follow directly after the SEEDBYTES of rho. */
+    const uint8_t *const t1_in = pk + SEEDBYTES;
+    unsigned int n;
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        rho[n] = pk[n];
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_unpack(&t1->vec[n], t1_in + n * POLYT1_PACKEDBYTES);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_pack_sk
+*
+* Description: Bit-pack secret key. The byte layout written is
+*              sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - uint8_t sk[]: output byte array
+*              - const uint8_t rho[]: byte array containing rho
+*              - const uint8_t tr[]: byte array containing tr
+*              - const uint8_t key[]: byte array containing key
+*              - const polyveck *t0: pointer to vector t0
+*              - const polyvecl *s1: pointer to vector s1
+*              - const polyveck *s2: pointer to vector s2
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES],
+        const uint8_t rho[SEEDBYTES],
+        const uint8_t tr[CRHBYTES],
+        const uint8_t key[SEEDBYTES],
+        const polyveck *t0,
+        const polyvecl *s1,
+        const polyveck *s2) {
+    /* Start of each section inside sk, in serialisation order; rho itself
+     * sits at offset 0. */
+    uint8_t *const key_out = sk + SEEDBYTES;
+    uint8_t *const tr_out = key_out + SEEDBYTES;
+    uint8_t *const s1_out = tr_out + CRHBYTES;
+    uint8_t *const s2_out = s1_out + L * POLYETA_PACKEDBYTES;
+    uint8_t *const t0_out = s2_out + K * POLYETA_PACKEDBYTES;
+    unsigned int n;
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        sk[n] = rho[n];
+    }
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        key_out[n] = key[n];
+    }
+
+    for (n = 0; n != CRHBYTES; n++) {
+        tr_out[n] = tr[n];
+    }
+
+    for (n = 0; n != L; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_pack(s1_out + n * POLYETA_PACKEDBYTES, &s1->vec[n]);
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_pack(s2_out + n * POLYETA_PACKEDBYTES, &s2->vec[n]);
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_pack(t0_out + n * POLYT0_PACKEDBYTES, &t0->vec[n]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sk
+*
+* Description: Unpack secret key. The byte layout read is
+*              sk = (rho, key, tr, s1, s2, t0).
+*
+* Arguments:   - uint8_t rho[]: output byte array for rho
+*              - uint8_t tr[]: output byte array for tr
+*              - uint8_t key[]: output byte array for key
+*              - polyveck *t0: pointer to output vector t0
+*              - polyvecl *s1: pointer to output vector s1
+*              - polyveck *s2: pointer to output vector s2
+*              - const uint8_t sk[]: byte array containing bit-packed sk
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
+        uint8_t tr[CRHBYTES],
+        uint8_t key[SEEDBYTES],
+        polyveck *t0,
+        polyvecl *s1,
+        polyveck *s2,
+        const uint8_t sk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES]) {
+    /* Start of each section inside sk, in serialisation order; rho itself
+     * sits at offset 0. */
+    const uint8_t *const key_in = sk + SEEDBYTES;
+    const uint8_t *const tr_in = key_in + SEEDBYTES;
+    const uint8_t *const s1_in = tr_in + CRHBYTES;
+    const uint8_t *const s2_in = s1_in + L * POLYETA_PACKEDBYTES;
+    const uint8_t *const t0_in = s2_in + K * POLYETA_PACKEDBYTES;
+    unsigned int n;
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        rho[n] = sk[n];
+    }
+
+    for (n = 0; n != SEEDBYTES; n++) {
+        key[n] = key_in[n];
+    }
+
+    for (n = 0; n != CRHBYTES; n++) {
+        tr[n] = tr_in[n];
+    }
+
+    for (n = 0; n != L; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_unpack(&s1->vec[n], s1_in + n * POLYETA_PACKEDBYTES);
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_unpack(&s2->vec[n], s2_in + n * POLYETA_PACKEDBYTES);
+    }
+
+    for (n = 0; n != K; n++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_unpack(&t0->vec[n], t0_in + n * POLYT0_PACKEDBYTES);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_pack_sig
+*
+* Description: Bit-pack signature sig = (c, z, h).
+*
+*              The hint section is OMEGA + K bytes: the indices of the
+*              nonzero coefficients of each polynomial of h, written in
+*              increasing order, followed by K bytes holding the running
+*              number of indices after each polynomial. Unused index bytes
+*              stay zero.
+*
+*              NOTE(review): index bytes are written without a bound check,
+*              so this assumes h has at most OMEGA nonzero coefficients in
+*              total -- confirm that callers guarantee this.
+*
+* Arguments:   - uint8_t sig[]: output byte array
+*              - const uint8_t *c: pointer to challenge hash of length SEEDBYTES
+*              - const polyvecl *z: pointer to vector z
+*              - const polyveck *h: pointer to hint vector h
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES],
+        const uint8_t c[SEEDBYTES],
+        const polyvecl *z,
+        const polyveck *h) {
+    uint8_t *const z_out = sig + SEEDBYTES;
+    uint8_t *const hint_out = z_out + L * POLYZ_PACKEDBYTES;
+    unsigned int row, pos, used;
+
+    for (pos = 0; pos != SEEDBYTES; pos++) {
+        sig[pos] = c[pos];
+    }
+
+    for (row = 0; row != L; row++) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyz_pack(z_out + row * POLYZ_PACKEDBYTES, &z->vec[row]);
+    }
+
+    /* Encode h */
+    for (pos = 0; pos != OMEGA + K; pos++) {
+        hint_out[pos] = 0;
+    }
+
+    used = 0;
+    for (row = 0; row != K; row++) {
+        for (pos = 0; pos != N; pos++) {
+            if (h->vec[row].coeffs[pos] == 0) {
+                continue;
+            }
+            hint_out[used] = (uint8_t) pos;
+            used++;
+        }
+
+        /* Running total of index bytes used up to and including this row. */
+        hint_out[OMEGA + row] = (uint8_t) used;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sig
+*
+* Description: Unpack signature sig = (c, z, h).
+*
+* Arguments:   - uint8_t *c: pointer to output challenge hash
+*              - polyvecl *z: pointer to output vector z
+*              - polyveck *h: pointer to output hint vector h
+*              - const uint8_t sig[]: byte array containing
+*                bit-packed signature
+*
+* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
+        polyvecl *z,
+        polyveck *h,
+        const uint8_t sig[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES]) {
+    unsigned int i, j, k;
+
+    for (i = 0; i < SEEDBYTES; ++i) {
+        c[i] = sig[i];
+    }
+    sig += SEEDBYTES;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
+    }
+    sig += L * POLYZ_PACKEDBYTES;
+
+    /* Decode h */
+    /* sig now points at the hint section: OMEGA index bytes followed by K
+     * running-count bytes.  k tracks how many index bytes are consumed. */
+    k = 0;
+    for (i = 0; i < K; ++i) {
+        for (j = 0; j < N; ++j) {
+            h->vec[i].coeffs[j] = 0;
+        }
+
+        /* The running count must not decrease and must not exceed OMEGA;
+         * this also keeps every sig[j] read below inside the index bytes. */
+        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
+            return 1;
+        }
+
+        for (j = k; j < sig[OMEGA + i]; ++j) {
+            /* Coefficients are ordered for strong unforgeability */
+            if (j > k && sig[j] <= sig[j - 1]) {
+                return 1;
+            }
+            h->vec[i].coeffs[sig[j]] = 1;
+        }
+
+        k = sig[OMEGA + i];
+    }
+
+    /* Extra indices are zero for strong unforgeability */
+    for (j = k; j < OMEGA; ++j) {
+        if (sig[j]) {
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/crypto_sign/dilithium2aes/clean/packing.h b/crypto_sign/dilithium2aes/clean/packing.h
new file mode 100644
index 00000000..a6dbd114
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/packing.h
@@ -0,0 +1,31 @@
+#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_PACKING_H
+#define PQCLEAN_DILITHIUM2AES_CLEAN_PACKING_H
+#include "params.h"
+#include "polyvec.h"
+#include <stdint.h>
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES],
+        const uint8_t rho[SEEDBYTES],
+        const uint8_t tr[CRHBYTES],
+        const uint8_t key[SEEDBYTES],
+        const polyveck *t0,
+        const polyvecl *s1,
+        const polyveck *s2);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES]);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES],
+        uint8_t tr[CRHBYTES],
+        uint8_t key[SEEDBYTES],
+        polyveck *t0,
+        polyvecl *s1,
+        polyveck *s2,
+        const uint8_t sk[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES]);
+
+int PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES]);
+
+#endif
diff --git a/crypto_sign/dilithium2aes/clean/params.h b/crypto_sign/dilithium2aes/clean/params.h
new file mode 100644
index 00000000..b8625aa3
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/params.h
@@ -0,0 +1,41 @@
+#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_PARAMS_H
+#define PQCLEAN_DILITHIUM2AES_CLEAN_PARAMS_H
+
+
+
+#define SEEDBYTES 32
+#define CRHBYTES 48
+#define N 256
+#define Q 8380417
+#define D 13
+#define ROOT_OF_UNITY 1753
+
+#define K 4
+#define L 4
+#define ETA 2
+#define TAU 39
+#define BETA 78
+#define GAMMA1 (1 << 17)
+#define GAMMA2 ((Q-1)/88)
+#define OMEGA 80
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_ALGNAME "Dilithium2-AES"
+
+
+#define POLYT1_PACKEDBYTES  320
+#define POLYT0_PACKEDBYTES  416
+#define POLYVECH_PACKEDBYTES (OMEGA + K)
+
+#define POLYZ_PACKEDBYTES   576
+
+#define POLYW1_PACKEDBYTES  192
+
+#define POLYETA_PACKEDBYTES  96
+
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
+        + L*POLYETA_PACKEDBYTES \
+        + K*POLYETA_PACKEDBYTES \
+        + K*POLYT0_PACKEDBYTES)
+#define PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)
+
+#endif
diff --git a/crypto_sign/dilithium2aes/clean/poly.c b/crypto_sign/dilithium2aes/clean/poly.c
new file mode 100644
index 00000000..7edd1629
--- /dev/null
+++ b/crypto_sign/dilithium2aes/clean/poly.c
@@ 
-0,0 +1,867 @@ +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rounding.h" +#include "symmetric.h" +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) /* benchmarking hooks: both DBENCH macros expand to nothing here */ + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_reduce(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_reduce32(a->coeffs[i]); /* coefficient-wise, result overwrites the input */ + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_caddq(a->coeffs[i]); /* coefficient-wise, result overwrites the input */ + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_freeze(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_freeze(a->coeffs[i]); /* coefficient-wise, result overwrites the input */ + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_add +* +* Description: Add polynomials.
No modular reduction is performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_shiftl(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] *= (1 << D); /* multiply rather than shift: left-shifting a negative signed value is undefined behaviour in C, the product is well defined within the documented bound */ + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_CLEAN_ntt(a->coeffs); /* thin wrapper: hands the raw coefficient array to the NTT routine */ + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM2AES_CLEAN_invntt_tomont(a->coeffs); /* thin wrapper: hands the raw coefficient array to the inverse NTT routine */ + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); /* widen to 64 bits before multiplying so the product is formed in int64_t */ + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives.
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); /* return value goes to a1; the helper is handed &a0->coeffs[i] for the other part */ + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such that c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); /* return value goes to a1; the helper is handed &a0->coeffs[i] for the other part */ + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; /* accumulate the hint values; this sum is the return value */ + } + + DBENCH_STOP(*tround); + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM2AES_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM2AES_CLEAN_reduce32(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM2AES_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { /* bounds above (Q-1)/8 are reported as a violation without inspecting a */ + return 1; + } + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative.
*/ + for (i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; /* sign mask: 0 for non-negative input; all ones for negative input assuming an arithmetic right shift - TODO confirm for target compilers */ + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); /* branch-free |x|: subtracts 2x only when the mask is set */ + + if (t >= B) { + DBENCH_STOP(*tsample); + return 1; + } + } + + DBENCH_STOP(*tsample); + return 0; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { /* each candidate consumes exactly 3 bytes */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits of the little-endian 3-byte value */ + + if (t < Q) { /* accept only candidates below Q; others are discarded */ + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce).
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) /* number of stream blocks needed to cover 768 bytes, rounded up */ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; /* 2 spare bytes: room for the up to 2 leftover bytes carried over below */ + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = rej_uniform(a->coeffs, N, buf, buflen); + + while (ctr < N) { + off = buflen % 3; /* bytes at the end of buf that did not form a complete 3-byte candidate */ + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; /* move those leftover bytes to the front */ + } + + stream128_squeezeblocks(buf + off, 1, &state); /* append one fresh block right after the leftovers */ + buflen = STREAM128_BLOCKBYTES + off; + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); + } + stream128_release(&state); +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given.
+**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; /* low nibble */ + t1 = buf[pos++] >> 4; /* high nibble */ + + if (t0 < 15) { /* nibble value 15 is rejected */ + t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5 for t0 < 15: (205 * t0 >> 10) equals t0 / 5 on that range */ + a[ctr++] = 2 - t0; /* maps 0..4 to 2..-2; NOTE(review): the literal 2 is hard-coded rather than ETA, which matches ETA == 2 in this parameter set */ + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) /* number of stream blocks needed to cover 136 bytes, rounded up */ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = rej_eta(a->coeffs, N, buf, buflen); + + while (ctr < N) { /* not enough accepted samples yet: squeeze one more block at a time */ + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } + stream128_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of
SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) /* number of stream blocks needed to cover POLYZ_PACKEDBYTES, rounded up */ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce) { + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + stream256_release(&state); + PQCLEAN_DILITHIUM2AES_CLEAN_polyz_unpack(a, buf); /* no rejection step: the stream bytes are unpacked directly as a packed z polynomial */ +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, sizeof buf, &state); + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)buf[i] << 8 * i; /* first 8 output bytes, little-endian, supply the sign bits */ + } + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { /* buffer exhausted: squeeze another block */ + shake256_inc_squeeze(buf, sizeof buf, &state); + pos = 0; + } + + b = buf[pos++]; + } while (b > i); /* reject bytes until an index b <= i is drawn */ + + c->coeffs[i] = c->coeffs[b]; /* move the current value at b to position i ... */ + c->coeffs[b] = 1 - 2 * (signs & 1); /* ... and place +1 (sign bit 0) or -1 (sign bit 1) at b */ + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* each iteration packs 8 coefficients into 3 bytes, 3 bits per coefficient */ + t[0] = (uint8_t) (ETA - a->coeffs[8 * i + 0]); /* ETA - x is the value that gets stored; for x in [-ETA,ETA] it lies in [0, 2*ETA] */ + t[1] = (uint8_t) (ETA - a->coeffs[8 * i + 1]); + t[2] = (uint8_t) (ETA - a->coeffs[8 * i + 2]); + t[3] = (uint8_t) (ETA - a->coeffs[8 * i + 3]); + t[4] = (uint8_t) (ETA - a->coeffs[8 * i + 4]); + t[5] = (uint8_t) (ETA - a->coeffs[8 * i + 5]); + t[6] = (uint8_t) (ETA - a->coeffs[8 * i + 6]); + t[7] = (uint8_t) (ETA - a->coeffs[8 * i + 7]); + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); /* t[2] straddles bytes 0 and 1 */ + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); /* t[5] straddles bytes 1 and 2 */ + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA].
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* each iteration expands 3 bytes into 8 coefficients, 3 bits per coefficient */ + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; /* field straddles bytes 0 and 1 */ + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; /* field straddles bytes 1 and 2 */ + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; /* map each stored 3-bit value v to ETA - v; NOTE(review): v is not range-checked here */ + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be standard representatives.
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* each iteration packs 4 coefficients into 5 bytes, 10 bits per coefficient */ + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* each iteration expands 5 bytes into 4 coefficients; the 0x3FF mask keeps 10 bits */ + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* each iteration packs 8 coefficients into 13 bytes */ + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; /* 2^{D-1} - x is the value that gets stored */ + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* each iteration expands 13 bytes into 8 coefficients; the 0x1FFF mask keeps 13 bits */ + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; /* undo the 2^{D-1} - x mapping applied when packing */ + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1))
- r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyz_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* each iteration packs 4 coefficients into 9 bytes, 18 bits per coefficient */ + t[0] = GAMMA1 - a->coeffs[4 * i + 0]; /* GAMMA1 - x is the value that gets stored */ + t[1] = GAMMA1 - a->coeffs[4 * i + 1]; + t[2] = GAMMA1 - a->coeffs[4 * i + 2]; + t[3] = GAMMA1 - a->coeffs[4 * i + 3]; + + r[9 * i + 0] = (uint8_t) t[0]; + r[9 * i + 1] = (uint8_t) (t[0] >> 8); + r[9 * i + 2] = (uint8_t) (t[0] >> 16); + r[9 * i + 2] |= (uint8_t) (t[1] << 2); + r[9 * i + 3] = (uint8_t) (t[1] >> 6); + r[9 * i + 4] = (uint8_t) (t[1] >> 14); + r[9 * i + 4] |= (uint8_t) (t[2] << 4); + r[9 * i + 5] = (uint8_t) (t[2] >> 4); + r[9 * i + 6] = (uint8_t) (t[2] >> 12); + r[9 * i + 6] |= (uint8_t) (t[3] << 6); + r[9 * i + 7] = (uint8_t) (t[3] >> 2); + r[9 * i + 8] = (uint8_t) (t[3] >> 10); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1].
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* each iteration expands 9 bytes into 4 coefficients; the 0x3FFFF mask keeps 18 bits */ + r->coeffs[4 * i + 0] = a[9 * i + 0]; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 1] << 8; + r->coeffs[4 * i + 0] |= (uint32_t)a[9 * i + 2] << 16; + r->coeffs[4 * i + 0] &= 0x3FFFF; + + r->coeffs[4 * i + 1] = a[9 * i + 2] >> 2; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 3] << 6; + r->coeffs[4 * i + 1] |= (uint32_t)a[9 * i + 4] << 14; + r->coeffs[4 * i + 1] &= 0x3FFFF; + + r->coeffs[4 * i + 2] = a[9 * i + 4] >> 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 5] << 4; + r->coeffs[4 * i + 2] |= (uint32_t)a[9 * i + 6] << 12; + r->coeffs[4 * i + 2] &= 0x3FFFF; + + r->coeffs[4 * i + 3] = a[9 * i + 6] >> 6; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 7] << 2; + r->coeffs[4 * i + 3] |= (uint32_t)a[9 * i + 8] << 10; + r->coeffs[4 * i + 3] &= 0x3FFFF; + + r->coeffs[4 * i + 0] = GAMMA1 - r->coeffs[4 * i + 0]; /* map each stored 18-bit value v to GAMMA1 - v */ + r->coeffs[4 * i + 1] = GAMMA1 - r->coeffs[4 * i + 1]; + r->coeffs[4 * i + 2] = GAMMA1 - r->coeffs[4 * i + 2]; + r->coeffs[4 * i + 3] = GAMMA1 - r->coeffs[4 * i + 3]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be standard representatives.
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* each iteration packs 4 coefficients into 3 bytes, 6 bits per coefficient */ + r[3 * i + 0] = (uint8_t) a->coeffs[4 * i + 0]; + r[3 * i + 0] |= (uint8_t) (a->coeffs[4 * i + 1] << 6); + r[3 * i + 1] = (uint8_t) (a->coeffs[4 * i + 1] >> 2); + r[3 * i + 1] |= (uint8_t) (a->coeffs[4 * i + 2] << 4); + r[3 * i + 2] = (uint8_t) (a->coeffs[4 * i + 2] >> 4); + r[3 * i + 2] |= (uint8_t) (a->coeffs[4 * i + 3] << 2); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium2aes/clean/poly.h b/crypto_sign/dilithium2aes/clean/poly.h new file mode 100644 index 00000000..6ad8405e --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/poly.h @@ -0,0 +1,53 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_POLY_H +#include "params.h" +#include + +typedef struct { + int32_t coeffs[N]; +} poly; + +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void
PQCLEAN_DILITHIUM2AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM2AES_CLEAN_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM2AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM2AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a); + +#endif diff --git a/crypto_sign/dilithium2aes/clean/polyvec.c b/crypto_sign/dilithium2aes/clean/polyvec.c new file mode 100644 index 00000000..a7a77fb3 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/polyvec.c @@ -0,0 +1,448 @@ +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+*
+* Arguments:   - polyvecl mat[K]: output matrix
+*              - const uint8_t rho[]: byte array containing seed rho
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) {
+    unsigned int i, j;
+
+    for (i = 0; i < K; ++i) {
+        for (j = 0; j < L; ++j) {
+            /* nonce for entry (i, j) is (i << 8) + j, truncated to 16 bits */
+            PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j));
+        }
+    }
+}
+/* Matrix-vector product: t->vec[i] = pointwise_acc_montgomery(mat[i], v) for i = 0..K-1. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v);
+    }
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length L **************/
+/**************************************************************/
+/* Sample all L polynomials with poly_uniform_eta; polynomial i uses nonce + i. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++);
+    }
+}
+/* Sample all L polynomials with poly_uniform_gamma1; polynomial i uses (uint16_t)(L * nonce + i). */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i));
+    }
+}
+/* Apply poly_reduce to each of the L polynomials of v, in place. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_reduce(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_reduce(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length L
+*              to standard representatives.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_freeze(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) { /* freeze each polynomial in place */
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_freeze(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_add
+*
+* Description: Add vectors of polynomials of length L, element by element,
+*              by calling poly_add on each pair. No modular reduction is
+*              performed.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(&v->vec[i]);
+    }
+}
+/* Apply poly_invntt_tomont to each of the L polynomials of v, in place. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_invntt_tomont(&v->vec[i]);
+    }
+}
+/* r->vec[i] = poly_pointwise_montgomery(a, v->vec[i]) for i = 0..L-1 (one polynomial times a vector). */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_acc_montgomery
+*
+* Description: Pointwise multiply vectors of polynomials of length L, multiply
+*              resulting vector by 2^{-32} and add (accumulate) polynomials
+*              in it. Input/output vectors are in NTT domain representation.
+*              The sum is built with poly_add, without intermediate reduction
+*              in this function.
+*
+* Arguments:   - poly *w: output polynomial
+*              - const polyvecl *u: pointer to first input vector
+*              - const polyvecl *v: pointer to second input vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
+        const polyvecl *u,
+        const polyvecl *v) {
+    unsigned int i;
+    poly t; /* scratch polynomial for the products with index >= 1 */
+
+    PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); /* first product goes straight into w */
+    for (i = 1; i < L; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]);
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_add(w, w, &t);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length L.
+*              Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_reduce().
+*
+* Arguments:   - const polyvecl *v: pointer to vector
+*              - int32_t B: norm bound
+*
+* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8
+* and 1 otherwise.
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) {
+    unsigned int i;
+
+    for (i = 0; i < L; ++i) {
+        if (PQCLEAN_DILITHIUM2AES_CLEAN_poly_chknorm(&v->vec[i], bound)) {
+            return 1; /* stop at the first polynomial that fails the check */
+        }
+    }
+
+    return 0;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+/* Sample all K polynomials with poly_uniform_eta; polynomial i uses nonce + i. */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to representatives in [-6283009,6283007], by calling
+*              poly_reduce on each polynomial.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_reduce(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq
+*
+* Description: For all coefficients of polynomials in vector of length K
+*              add Q if coefficient is negative.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_caddq(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_freeze
+*
+* Description: Reduce coefficients of polynomials in vector of length K
+*              to standard representatives, by calling poly_freeze on
+*              each polynomial.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_freeze(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_freeze(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_add
+*
+* Description: Add vectors of polynomials of length K, element by element.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first summand
+*              - const polyveck *v: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_sub
+*
+* Description: Subtract vectors of polynomials of length K.
+*              No modular reduction is performed.
+*
+* Arguments:   - polyveck *w: pointer to output vector
+*              - const polyveck *u: pointer to first input vector
+*              - const polyveck *v: pointer to second input vector to be
+*                                   subtracted from first input vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_shiftl
+*
+* Description: Multiply vector of polynomials of length K by 2^D without modular
+*              reduction. Assumes input coefficients to be less than 2^{31-D}.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_shiftl(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_shiftl(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length K. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(&v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont
+*
+* Description: Inverse NTT and multiplication by 2^{32} of polynomials
+*              in vector of length K. Input coefficients need to be less
+*              than 2*Q.
+*
+* Arguments:   - polyveck *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_invntt_tomont(&v->vec[i]);
+    }
+}
+/* r->vec[i] = poly_pointwise_montgomery(a, v->vec[i]) for i = 0..K-1 (one polynomial times a vector). */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]);
+    }
+}
+
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_chknorm
+*
+* Description: Check infinity norm of polynomials in vector of length K.
+*              Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce().
+*
+* Arguments:   - const polyveck *v: pointer to vector
+*              - int32_t bound: norm bound
+*
+* Returns 0 if the norm of every polynomial is strictly smaller than
+* bound <= (Q-1)/8 and 1 otherwise.
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        if (PQCLEAN_DILITHIUM2AES_CLEAN_poly_chknorm(&v->vec[i], bound)) {
+            return 1; /* stop at the first polynomial that fails the check */
+        }
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_power2round
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute a0, a1 such that a mod^+ Q = a1*2^D + a0
+*              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+*              standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0
+*              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+*              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int i;
+
+    for (i = 0; i < K; ++i) {
+        PQCLEAN_DILITHIUM2AES_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM2AES_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM2AES_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium2aes/clean/polyvec.h b/crypto_sign/dilithium2aes/clean/polyvec.h new file mode 100644 index 00000000..ea0ade36 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/polyvec.h @@ -0,0 +1,68 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void 
PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_reduce(polyvecl *v);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_freeze(polyvecl *v);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
+/* NTT-domain helpers for length-L vectors */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(polyvecl *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w,
+        const polyvecl *u,
+        const polyvecl *v);
+
+/* Returns 1 if poly_chknorm flags any of the L polynomials, 0 otherwise (B is the norm bound). */
+int PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B);
+
+
+
+/* Vectors of polynomials of length K */
+typedef struct {
+    poly vec[K];
+} polyveck;
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq(polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_freeze(polyveck *v);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_shiftl(polyveck *v);
+/* NTT-domain helpers for length-K vectors */
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt(polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v);
+/* Returns 1 if poly_chknorm flags any of the K polynomials, 0 otherwise (B is the norm bound). */
+int PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B);
+
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
+void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
+unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_make_hint(polyveck *h,
+        const polyveck *v0,
+        const 
polyveck *v1); +void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium2aes/clean/reduce.c b/crypto_sign/dilithium2aes/clean/reduce.c new file mode 100644 index 00000000..6e8f0269 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/reduce.c @@ -0,0 +1,69 @@ +#include "params.h" +#include "reduce.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; + + t = (int32_t)((uint64_t)a * (uint64_t)QINV); + t = (a - (int64_t)t * Q) >> 32; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t * Q; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_caddq +* +* Description: Add Q if input coefficient is negative. 
+* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_freeze(int32_t a) { + a = PQCLEAN_DILITHIUM2AES_CLEAN_reduce32(a); + a = PQCLEAN_DILITHIUM2AES_CLEAN_caddq(a); + return a; +} diff --git a/crypto_sign/dilithium2aes/clean/reduce.h b/crypto_sign/dilithium2aes/clean/reduce.h new file mode 100644 index 00000000..7ba1c955 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/reduce.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_REDUCE_H +#include "params.h" +#include + +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_montgomery_reduce(int64_t a); + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_reduce32(int32_t a); + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_caddq(int32_t a); + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_freeze(int32_t a); + +#endif diff --git a/crypto_sign/dilithium2aes/clean/rounding.c b/crypto_sign/dilithium2aes/clean/rounding.c new file mode 100644 index 00000000..c467c5b3 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/rounding.c @@ -0,0 +1,98 @@ +#include "params.h" +#include "rounding.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. 
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t PQCLEAN_DILITHIUM2AES_CLEAN_power2round(int32_t *a0, int32_t a) {
+    int32_t a1;
+
+    a1 = (a + (1 << (D - 1)) - 1) >> D;
+    *a0 = a - (a1 << D); /* remainder after removing a1 * 2^D */
+    return a1;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
+*              representative. The code uses 2*GAMMA2 as the multiplier of a1,
+*              and maps any a1 above 43 to 0.
+*
+* Arguments:   - int32_t *a0: pointer to output element a0
+*              - int32_t a: input element
+*
+* Returns a1.
+**************************************************/
+int32_t PQCLEAN_DILITHIUM2AES_CLEAN_decompose(int32_t *a0, int32_t a) {
+    int32_t a1;
+
+    a1  = (a + 127) >> 7;
+    a1  = (a1 * 11275 + (1 << 23)) >> 24;
+    a1 ^= ((43 - a1) >> 31) & a1; /* branch-free: a1 becomes 0 when a1 > 43 (mask is all ones for a negative difference) */
+
+    *a0  = a - a1 * 2 * GAMMA2;
+    *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; /* branch-free: subtract Q when *a0 > (Q-1)/2 */
+    return a1;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits.
+*
+* Arguments:   - int32_t a0: low bits of input element
+*              - int32_t a1: high bits of input element
+*
+* Returns 1 if overflow, i.e. if a0 lies outside [-GAMMA2, GAMMA2] or if
+* a0 == -GAMMA2 with a1 != 0; returns 0 otherwise.
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_make_hint(int32_t a0, int32_t a1) {
+    if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_use_hint
+*
+* Description: Correct high bits according to hint.
+* +* Arguments: - int32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. +**************************************************/ +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM2AES_CLEAN_decompose(&a0, a); + if (hint == 0) { + return a1; + } + + if (a0 > 0) { + if (a1 == 43) { + return 0; + } + return a1 + 1; + } + if (a1 == 0) { + return 43; + } + return a1 - 1; +} diff --git a/crypto_sign/dilithium2aes/clean/rounding.h b/crypto_sign/dilithium2aes/clean/rounding.h new file mode 100644 index 00000000..e970c5d9 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/rounding.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_ROUNDING_H +#include "params.h" +#include + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM2AES_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM2AES_CLEAN_use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium2aes/clean/sign.c b/crypto_sign/dilithium2aes/clean/sign.c new file mode 100644 index 00000000..a30df581 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/sign.c @@ -0,0 +1,343 @@ +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_keypair +* +* Description: Generates public and private key. 
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+    uint8_t seedbuf[3 * SEEDBYTES];
+    uint8_t tr[CRHBYTES];
+    const uint8_t *rho, *rhoprime, *key;
+    polyvecl mat[K];
+    polyvecl s1, s1hat;
+    polyveck s2, t1, t0;
+
+    /* Draw SEEDBYTES random bytes and expand them in place with SHAKE256
+     * to 3*SEEDBYTES: rho, rhoprime and key, in that order */
+    randombytes(seedbuf, SEEDBYTES);
+    shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
+    rho = seedbuf;
+    rhoprime = seedbuf + SEEDBYTES;
+    key = seedbuf + 2 * SEEDBYTES;
+
+    /* Expand matrix */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_expand(mat, rho);
+
+    /* Sample short vectors s1 and s2; s2 starts at nonce L so it continues after the L nonces used for s1 */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L);
+
+    /* Matrix-vector multiplication on an NTT-domain copy of s1 (s1 itself is packed into sk unchanged) */
+    s1hat = s1;
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(&s1hat);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(&t1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(&t1);
+
+    /* Add error vector s2 */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_add(&t1, &t1, &s2);
+
+    /* Extract t1 and write public key */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq(&t1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_power2round(&t1, &t0, &t1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_pack_pk(pk, rho, &t1);
+
+    /* Compute tr = CRH over the packed public key (rho, t1) and write secret key */
+    crh(tr, pk, PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES);
+    PQCLEAN_DILITHIUM2AES_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+    return 0;
+}
+
+/*************************************************
+* Name:        
PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_signature
+*
+* Description: Computes signature.
+*
+* Arguments:   - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES)
+*              - size_t *siglen: pointer to output length of signature
+*              - uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_signature(uint8_t *sig,
+        size_t *siglen,
+        const uint8_t *m,
+        size_t mlen,
+        const uint8_t *sk) {
+    unsigned int n;
+    uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; /* holds rho | tr | key | mu | rhoprime, in that order */
+    uint8_t *rho, *tr, *key, *mu, *rhoprime;
+    uint16_t nonce = 0;
+    polyvecl mat[K], s1, y, z;
+    polyveck t0, s2, w1, w0, h;
+    poly cp;
+    shake256incctx state;
+
+    rho = seedbuf;
+    tr = rho + SEEDBYTES;
+    key = tr + CRHBYTES;
+    mu = key + SEEDBYTES;
+    rhoprime = mu + CRHBYTES;
+    PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk);
+
+    /* Compute mu = CRH(tr, msg) with incremental SHAKE256 */
+    shake256_inc_init(&state);
+    shake256_inc_absorb(&state, tr, CRHBYTES);
+    shake256_inc_absorb(&state, m, mlen);
+    shake256_inc_finalize(&state);
+    shake256_inc_squeeze(mu, CRHBYTES, &state);
+    shake256_inc_ctx_release(&state);
+    /* rhoprime = CRH(key | mu): key and mu are adjacent in seedbuf, so one call covers both */
+    crh(rhoprime, key, SEEDBYTES + CRHBYTES);
+
+    /* Expand matrix and transform vectors */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_expand(mat, rho);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(&s1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt(&s2);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt(&t0);
+
+rej:
+    /* Sample intermediate vector y; every retry arrives here with an incremented nonce */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++);
+
+    /* Matrix-vector multiplication (z is used as the NTT-domain copy of y) */
+    z = y;
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(&z);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(&w1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(&w1);
+
+    /* Decompose w and call the random oracle */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq(&w1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_decompose(&w1, &w0, &w1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pack_w1(sig, &w1); /* sig doubles as scratch space for the packed w1 */
+
+    shake256_inc_init(&state);
+    shake256_inc_absorb(&state, mu, CRHBYTES);
+    shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES);
+    shake256_inc_finalize(&state);
+    shake256_inc_squeeze(sig, SEEDBYTES, &state); /* challenge seed overwrites the start of sig */
+    shake256_inc_ctx_release(&state);
+    PQCLEAN_DILITHIUM2AES_CLEAN_poly_challenge(&cp, sig);
+    PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(&cp);
+
+    /* Compute z = y + c*s1, reject if it reveals secret */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_invntt_tomont(&z);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_add(&z, &z, &y);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_reduce(&z);
+    if (PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) {
+        goto rej;
+    }
+
+    /* Check that subtracting cs2 does not change high bits of w and low bits
+     * do not reveal secret information */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(&h);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_sub(&w0, &w0, &h);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(&w0);
+    if (PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) {
+        goto rej;
+    }
+
+    /* Compute hints for w1; h first holds c*t0 and is then overwritten with the hint vector */
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(&h);
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(&h);
+    if (PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_chknorm(&h, GAMMA2)) {
+        goto rej;
+    }
+
+    PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_add(&w0, &w0, &h);
+    n = PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_make_hint(&h, &w0, &w1);
+    if (n > OMEGA) { /* more hint bits than OMEGA: retry */
+        goto rej;
+    }
+
+    /* Write signature; the challenge seed passed as second argument already sits at the start of sig */
+    PQCLEAN_DILITHIUM2AES_CLEAN_pack_sig(sig, sig, &z, &h);
+    *siglen = PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES;
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign
+*
+* Description: Compute signed message: the signature followed by the message.
+*
+* Arguments:   - uint8_t *sm: pointer to output signed message (allocated
+*                             array with PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES + mlen bytes),
+*                             can be equal to m
+*              - size_t *smlen: pointer to output length of signed
+*                               message
+*              - const uint8_t *m: pointer to message to be signed
+*              - size_t mlen: length of message
+*              - const uint8_t *sk: pointer to bit-packed secret key
+*
+* Returns 0 (success)
+**************************************************/
+int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign(uint8_t *sm,
+        size_t *smlen,
+        const uint8_t *m,
+        size_t mlen,
+        const uint8_t *sk) {
+    size_t i;
+
+    for (i = 0; i < mlen; ++i) { /* copy the message behind the signature slot, last byte first */
+        sm[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i];
+    }
+    PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES, mlen, sk);
+    *smlen += mlen; /* signature length plus message length */
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_verify
+*
+* Description: Verifies signature.
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; + polyvecl mat[K], z; + polyveck t1, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM2AES_CLEAN_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM2AES_CLEAN_unpack_sig(c, &z, &h, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Matrix-vector multiplication; compute Az - c * 2^d * t1 */ + PQCLEAN_DILITHIUM2AES_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_expand(mat, rho); + + PQCLEAN_DILITHIUM2AES_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM2AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + PQCLEAN_DILITHIUM2AES_CLEAN_poly_ntt(&cp); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_ntt(&t1); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); + + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_sub(&w1, &w1, &t1); +
PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_invntt_tomont(&w1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM2AES_CLEAN_polyveck_pack_w1(buf, &w1); + + /* Call random oracle and verify that the recomputed challenge seed c2 equals c from the signature */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { /* front-to-back copy, safe when m == sm */ + m[i] = sm[PQCLEAN_DILITHIUM2AES_CLEAN_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification
failed */ + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium2aes/clean/sign.h b/crypto_sign/dilithium2aes/clean/sign.h new file mode 100644 index 00000000..567eb054 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stddef.h> +#include <stdint.h> + +void PQCLEAN_DILITHIUM2AES_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM2AES_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium2aes/clean/symmetric-aes.c b/crypto_sign/dilithium2aes/clean/symmetric-aes.c new file mode 100644 index 00000000..4ae16c26 --- /dev/null +++ b/crypto_sign/dilithium2aes/clean/symmetric-aes.c @@ -0,0 +1,12 @@ +#include "aes256ctr.h" +#include "symmetric.h" +#include <stdint.h> + +void PQCLEAN_DILITHIUM2AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce) { + uint8_t expnonce[12] = {0}; /* nonce stored little-endian in bytes 0-1; the other 10 bytes stay zero */ + expnonce[0] = (uint8_t) nonce; + expnonce[1] = (uint8_t) (nonce >> 8); + PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_init(state, key, expnonce); +} diff --git a/crypto_sign/dilithium2aes/clean/symmetric.h b/crypto_sign/dilithium2aes/clean/symmetric.h new file mode 100644 index 00000000..6ef04c5c --- /dev/null +++
b/crypto_sign/dilithium2aes/clean/symmetric.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM2AES_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM2AES_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include <stdint.h> + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +void PQCLEAN_DILITHIUM2AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) /* expands to nothing */ +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM2AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM2AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) /* expands to nothing */ + + +#endif diff --git a/crypto_sign/dilithium3/META.yml b/crypto_sign/dilithium3/META.yml index 37f51f72..3b84f249 100644 --- a/crypto_sign/dilithium3/META.yml +++ b/crypto_sign/dilithium3/META.yml @@ -1,11 +1,11 @@ name: Dilithium3 type: signature -claimed-nist-level: 2 -length-public-key: 1472 -length-secret-key: 3504 -length-signature: 2701 -nistkat-sha256: 900268789819cc81b03e6384d97336b7bc700a5a9ffd5d3c993deacb6fe7f5b6 -testvectors-sha256: 35d7e51b9e4e456c68bfc5ae393d311c96005d8563eb3240a051c97f3710c45d +claimed-nist-level: 3 +length-public-key: 1952 +length-secret-key: 4016 +length-signature: 3293 +nistkat-sha256: d0d4bb6945e14206d17b52f8a395d5a750ec8a73f2ea06b9f1cd226d225a9bfb +testvectors-sha256: 531b85dbecaeaf135ad9004c8e2d5ce163b8e72d9c3a537e15bd383cf5f38aa4
principal-submitters: - Vadim Lyubashevsky auxiliary-submitters: @@ -17,15 +17,15 @@ auxiliary-submitters: - Damien Stehlé implementations: - name: clean - version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium - name: avx2 - version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium supported_platforms: - - architecture: x86_64 - operating_systems: - - Darwin - - Linux - required_flags: - - avx2 - - bmi1 - - popcnt + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium3/avx2/LICENSE b/crypto_sign/dilithium3/avx2/LICENSE index 40541676..08473af7 100644 --- a/crypto_sign/dilithium3/avx2/LICENSE +++ b/crypto_sign/dilithium3/avx2/LICENSE @@ -1,6 +1,5 @@ Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium3/avx2/Makefile b/crypto_sign/dilithium3/avx2/Makefile index 8f4e1dff..8cd2a155 100644 --- a/crypto_sign/dilithium3/avx2/Makefile +++ b/crypto_sign/dilithium3/avx2/Makefile @@ -1,34 +1,27 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium3_avx2.a - -SOURCES = fips202x4.c invntt.S nttconsts.c ntt.S packing.c pointwise.S poly.c \ - polyvec.c reduce.S rejsample.c rounding.c sign.c stream.c -OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ - polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o -HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ - fips202x4.h shuffle.inc cdecl.inc - -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ - -Wmissing-prototypes -Wredundant-decls -std=c99 \ - -Wcast-align -Werror=shadow\ - -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) - -all: $(LIB) - +HEADERS=align.h api.h cdecl.h consts.h fips202x4.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=consts.o fips202x4.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o symmetric-shake.o f1600x4.o invntt.o ntt.o pointwise.o shuffle.o KECCAK4XDIR=../../../common/keccak4x KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) +CFLAGS=-mavx2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< %.o: %.S $(HEADERS) - $(CC) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) - $(AR) -r $@ $^ + $(AR) -r $@ $(OBJECTS) $(KECCAK4X) $(KECCAK4X): $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) @@ -36,5 +29,3 @@ $(KECCAK4X): clean: $(RM) $(OBJECTS) $(RM) $(LIB) - $(MAKE) -C $(KECCAK4XDIR) clean - diff --git 
a/crypto_sign/dilithium3/avx2/align.h b/crypto_sign/dilithium3/avx2/align.h new file mode 100644 index 00000000..668db1c0 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM3_AVX2_ALIGN_H + +#include <immintrin.h> +#include <stdint.h> + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } /* vec has ceil(N/32) __m256i elements overlaying coeffs */ + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } /* vec has ceil(N/8) __m256i elements overlaying coeffs */ + +#endif diff --git a/crypto_sign/dilithium3/avx2/alignment.h b/crypto_sign/dilithium3/avx2/alignment.h deleted file mode 100644 index a1eb88f8..00000000 --- a/crypto_sign/dilithium3/avx2/alignment.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H -#define PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H - -#define ALIGNED_UINT8(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/32]; \ - } - -#define ALIGNED_UINT32(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#define ALIGNED_UINT64(N) \ - union { \ - uint64_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#endif //PQCLEAN_DILITHIUM3_AVX2_ALIGNMENT_H diff --git a/crypto_sign/dilithium3/avx2/api.h b/crypto_sign/dilithium3/avx2/api.h index a3f7603e..f6cbffa8 100644 --- a/crypto_sign/dilithium3/avx2/api.h +++ b/crypto_sign/dilithium3/avx2/api.h @@ -4,26 +4,14 @@ #include <stddef.h> #include <stdint.h> - -#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1472U -#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 3504U -#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 2701U +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES 1952 +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES 4016 +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES 3293 #define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3" -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t
*sk); - -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -33,6 +21,12 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium2/avx2/cdecl.inc b/crypto_sign/dilithium3/avx2/cdecl.h similarity index 55% rename from crypto_sign/dilithium2/avx2/cdecl.inc rename to crypto_sign/dilithium3/avx2/cdecl.h index 3e290d89..e1b6605e 100644 --- a/crypto_sign/dilithium2/avx2/cdecl.inc +++ b/crypto_sign/dilithium3/avx2/cdecl.h @@ -1,5 +1,14 @@ -#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL -#define PQCLEAN_DILITHIUM2_AVX2_CDECL +#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM3_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 /* The C ABI on MacOS exports all symbols with a leading * underscore. 
This means that any symbols we refer to from @@ -9,10 +18,7 @@ * This define helps us get around this */ -#if defined(__WIN32__) || defined(__APPLE__) -#define cdecl(s) _##s -#else +#define _cdecl(s) _##s #define cdecl(s) s -#endif #endif diff --git a/crypto_sign/dilithium3/avx2/consts.c b/crypto_sign/dilithium3/avx2/consts.c new file mode 100644 index 00000000..757fa122 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include <stdint.h> + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) // DIV * QINV mod 2^32 (signed representative) + +const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372,
-594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + -1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 
137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 
3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium3/avx2/consts.h b/crypto_sign/dilithium3/avx2/consts.h new file mode 100644 index 00000000..d244c443 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM3_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM3_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM3_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium3/avx2/f1600x4.S 
b/crypto_sign/dilithium3/avx2/f1600x4.S new file mode 100644 index 00000000..1aedd83f --- /dev/null +++ b/crypto_sign/dilithium3/avx2/f1600x4.S @@ -0,0 +1,909 @@ +/* Taken from Bas Westerbaan's new 4-way SHAKE implementation + * for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), + * but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ + +#include "cdecl.h" + +.data +.p2align 5 +rho8: +.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 +rho56: +.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 + +.text +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4) +cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_f1600x4): +vmovdqa rho8(%rip), %ymm0 +movq $6, %rax +looptop: +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, 
%ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 192(%rdi), %ymm4, %ymm9 +vpxor 384(%rdi), %ymm3, %ymm10 +vpxor 576(%rdi), %ymm2, %ymm11 +vpxor 768(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 0(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 96(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 320(%rdi), %ymm5, %ymm10 +vpxor 512(%rdi), %ymm4, %ymm11 +vpxor 704(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 
320(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 32(%rdi), %ymm4, %ymm8 +vpxor 224(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 608(%rdi), %ymm1, %ymm11 +vpxor 640(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 128(%rdi), %ymm1, %ymm8 +vpxor 160(%rdi), %ymm5, %ymm9 +vpxor 352(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 736(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, 
%ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 64(%rdi), %ymm3, %ymm8 +vpxor 256(%rdi), %ymm2, %ymm9 +vpxor 448(%rdi), %ymm1, %ymm10 +vpxor 480(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 448(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq 
$1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 512(%rdi), %ymm4, %ymm9 +vpxor 224(%rdi), %ymm3, %ymm10 +vpxor 736(%rdi), %ymm2, %ymm11 +vpxor 448(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 8(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 576(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 640(%rdi), %ymm5, %ymm10 +vpxor 352(%rdi), %ymm4, %ymm11 +vpxor 64(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, 
%ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 192(%rdi), %ymm4, %ymm8 +vpxor 704(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 128(%rdi), %ymm1, %ymm11 +vpxor 480(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 768(%rdi), %ymm1, %ymm8 +vpxor 320(%rdi), %ymm5, %ymm9 +vpxor 32(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 256(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, 
%ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 384(%rdi), %ymm3, %ymm8 +vpxor 96(%rdi), %ymm2, %ymm9 +vpxor 608(%rdi), %ymm1, %ymm10 +vpxor 160(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 608(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 
+vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 352(%rdi), %ymm4, %ymm9 +vpxor 704(%rdi), %ymm3, %ymm10 +vpxor 256(%rdi), %ymm2, %ymm11 +vpxor 608(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 16(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 736(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 480(%rdi), %ymm5, %ymm10 +vpxor 32(%rdi), %ymm4, %ymm11 +vpxor 384(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, 
%ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 512(%rdi), %ymm4, %ymm8 +vpxor 64(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 768(%rdi), %ymm1, %ymm11 +vpxor 160(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 448(%rdi), %ymm1, %ymm8 +vpxor 640(%rdi), %ymm5, %ymm9 +vpxor 192(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 96(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 
+vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 224(%rdi), %ymm3, %ymm8 +vpxor 576(%rdi), %ymm2, %ymm9 +vpxor 128(%rdi), %ymm1, %ymm10 +vpxor 320(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 128(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, 
%ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 32(%rdi), %ymm4, %ymm9 +vpxor 64(%rdi), %ymm3, %ymm10 +vpxor 96(%rdi), %ymm2, %ymm11 +vpxor 128(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 24(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa 
%ymm14, 32(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 256(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 160(%rdi), %ymm5, %ymm10 +vpxor 192(%rdi), %ymm4, %ymm11 +vpxor 224(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 352(%rdi), %ymm4, %ymm8 +vpxor 384(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 448(%rdi), %ymm1, %ymm11 +vpxor 320(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 
320(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 608(%rdi), %ymm1, %ymm8 +vpxor 480(%rdi), %ymm5, %ymm9 +vpxor 512(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 576(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 704(%rdi), %ymm3, %ymm8 +vpxor 736(%rdi), %ymm2, %ymm9 +vpxor 768(%rdi), %ymm1, %ymm10 +vpxor 640(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, 
%ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 768(%rdi) +addq $32, %rsi +subq $1, %rax +jnz looptop +ret diff --git a/crypto_sign/dilithium3/avx2/fips202x4.c b/crypto_sign/dilithium3/avx2/fips202x4.c index 7df19546..bb4a3767 100644 --- a/crypto_sign/dilithium3/avx2/fips202x4.c +++ b/crypto_sign/dilithium3/avx2/fips202x4.c @@ -1,233 +1,219 @@ -#include -#include - #include "fips202.h" #include "fips202x4.h" -#include "params.h" +#include +#include +#include +#include #define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) -static uint64_t load64(const uint8_t *x) { - uint64_t r = 0; +/* Keccak round constants */ +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; - for (size_t i = 0; i < 8; ++i) { - r |= (uint64_t)x[i] << 8 * i; - } - - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - for (size_t i = 0; i < 8; ++i) { - x[i] = (uint8_t)(u >> 8 * i); - } -} - -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define 
KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, - uint8_t r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, - uint8_t p) { +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { size_t i; - uint8_t t0[200]; - uint8_t t1[200]; - uint8_t t2[200]; - uint8_t t3[200]; - uint64_t *ss = (uint64_t *)s; + uint64_t pos = 0; + __m256i t, idx; for (i = 0; i < 25; ++i) { - s[i] = _mm256_xor_si256(s[i], s[i]); + s[i] = _mm256_setzero_si256(); } - while (mlen >= r) { + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 * i); + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } + inlen -= r; - KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; + PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); } - for (i = 0; i < r; ++i) { - t0[i] = 0; - t1[i] = 0; - t2[i] = 0; - t3[i] = 0; + for (i = 0; i < inlen / 8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; + inlen -= 8 * i; + + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); } - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 
* i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 + 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); - } + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } - -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, size_t nblocks, - uint8_t r, - __m256i *s) { - uint64_t *ss = (uint64_t *)s; + unsigned int r, + __m256i s[25]) { + unsigned int i; + __m128d t; while (nblocks > 0) { - KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < r / 8; ++i) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); + PQCLEAN_DILITHIUM3_AVX2_f1600x4(s, KeccakF_RoundConstants); + for (i = 0; i < r / 8; ++i) { + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); } - h0 += r; - h1 += r; - h2 += r; - h3 += r; + out0 += r; + out1 += r; + out2 += r; + out3 += r; --nblocks; } - } -void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); +void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } -void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, 
- uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); +void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); } -void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); +void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); } -void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); +void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); } -void PQCLEAN_DILITHIUM3_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE128_RATE; +void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; uint8_t t[4][SHAKE128_RATE]; - __m256i s[25]; + keccakx4_state state; - 
PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - h0 += nblocks * SHAKE128_RATE; - h1 += nblocks * SHAKE128_RATE; - h2 += nblocks * SHAKE128_RATE; - h3 += nblocks * SHAKE128_RATE; - hlen -= nblocks * SHAKE128_RATE; + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; - if (hlen) { - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; + if (outlen) { + PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } -void PQCLEAN_DILITHIUM3_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE256_RATE; +void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE256_RATE; uint8_t t[4][SHAKE256_RATE]; - __m256i s[25]; + keccakx4_state state; - PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); + PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); + 
PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); - h0 += nblocks * SHAKE256_RATE; - h1 += nblocks * SHAKE256_RATE; - h2 += nblocks * SHAKE256_RATE; - h3 += nblocks * SHAKE256_RATE; - hlen -= nblocks * SHAKE256_RATE; + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; - if (hlen) { - PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; + if (outlen) { + PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } diff --git a/crypto_sign/dilithium3/avx2/fips202x4.h b/crypto_sign/dilithium3/avx2/fips202x4.h index 51b085c9..0a480d63 100644 --- a/crypto_sign/dilithium3/avx2/fips202x4.h +++ b/crypto_sign/dilithium3/avx2/fips202x4.h @@ -5,62 +5,60 @@ #include #include -#include "params.h" +typedef struct { + __m256i s[25]; +} keccakx4_state; -void PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM3_AVX2_f1600x4(__m256i *s, const uint64_t *rc); -void PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); +void PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); -void PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + 
uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); -void PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); +void PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); -void PQCLEAN_DILITHIUM3_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); -void PQCLEAN_DILITHIUM3_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); +void PQCLEAN_DILITHIUM3_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +void PQCLEAN_DILITHIUM3_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); #endif diff --git a/crypto_sign/dilithium3/avx2/invntt.S b/crypto_sign/dilithium3/avx2/invntt.S index 0dbfacf6..41c831b0 100644 --- a/crypto_sign/dilithium3/avx2/invntt.S +++ b/crypto_sign/dilithium3/avx2/invntt.S @@ -1,282 +1,240 @@ +#include "cdecl.h" .include "shuffle.inc" -#include "cdecl.inc" -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 -vpaddd %ymm2,%ymm\l0,%ymm12 -vpaddd %ymm2,%ymm\l1,%ymm13 -vpaddd %ymm2,%ymm\l2,%ymm14 +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 
+vpaddd %ymm\h,%ymm\l,%ymm\l -vpsubd %ymm\h0,%ymm12,%ymm12 -vpsubd %ymm\h1,%ymm13,%ymm13 -vpsubd %ymm\h2,%ymm14,%ymm14 +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 -vpmuludq %ymm\z0,%ymm12,%ymm12 -vpmuludq %ymm\z0,%ymm13,%ymm13 -vpaddd %ymm2,%ymm\l3,%ymm15 +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h -vpmuludq %ymm\z1,%ymm14,%ymm14 -vpsubd %ymm\h3,%ymm15,%ymm15 -vpaddd %ymm\l0,%ymm\h0,%ymm\l0 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 -vpmuludq %ymm\z1,%ymm15,%ymm15 -vpaddd %ymm\l1,%ymm\h1,%ymm\l1 -vpaddd %ymm\l2,%ymm\h2,%ymm\l2 +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h -vpaddd %ymm\l3,%ymm\h3,%ymm\l3 - -vpmuludq %ymm0,%ymm12,%ymm\h0 -vpmuludq %ymm0,%ymm13,%ymm\h1 -vpmuludq %ymm0,%ymm14,%ymm\h2 -vpmuludq %ymm0,%ymm15,%ymm\h3 -vpmuludq %ymm1,%ymm\h0,%ymm\h0 -vpmuludq %ymm1,%ymm\h1,%ymm\h1 -vpmuludq %ymm1,%ymm\h2,%ymm\h2 -vpmuludq %ymm1,%ymm\h3,%ymm\h3 -vpaddq %ymm12,%ymm\h0,%ymm\h0 -vpaddq %ymm13,%ymm\h1,%ymm\h1 -vpaddq %ymm14,%ymm\h2,%ymm\h2 -vpaddq %ymm15,%ymm\h3,%ymm\h3 -vpsrlq $32,%ymm\h0,%ymm\h0 -vpsrlq $32,%ymm\h1,%ymm\h1 -vpsrlq $32,%ymm\h2,%ymm\h2 -vpsrlq $32,%ymm\h3,%ymm\h3 +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h .endm -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx) -cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 -#load -vmovdqa (%rsi),%ymm6 -vmovdqa 32(%rsi),%ymm7 -vmovdqa 64(%rsi),%ymm5 -vmovdqa 96(%rsi),%ymm10 +/* level 0 */ +vpermq 
$0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 -#reorder -shuffle8 6,5,8,5 -shuffle8 7,10,6,10 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 -shuffle4 8,6,4,6 -shuffle4 5,10,8,10 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 -vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 -vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 -level0: -vpmovzxdq (%rdx),%ymm3 -vpmovzxdq 16(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpmovzxdq 
32(%rdx),%ymm5 -vpmovzxdq 48(%rdx),%ymm7 +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 -vpaddd %ymm10,%ymm11,%ymm10 +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 -level1: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpmovzxdq 64(%rdx),%ymm15 -vpmovzxdq 80(%rdx),%ymm3 +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 -butterfly 4,5,8,9,6,7,10,11 +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 -level2: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpmovzxdq 96(%rdx),%ymm3 +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm -butterfly 4,5,6,7,8,9,10,11,3,3 +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 
384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 -#shuffle -shuffle4 4,5,3,5 -shuffle4 6,7,4,7 -shuffle4 8,9,6,9 -shuffle4 10,11,8,11 +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 -level3: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpbroadcastd 112(%rdx),%ymm14 -vpbroadcastd 116(%rdx),%ymm15 -vpblendd $0xF0,%ymm15,%ymm14,%ymm10 +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 -butterfly 3,4,6,8,5,7,9,11,10,10 +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 -#shuffle -shuffle8 3,4,10,4 -shuffle8 6,8,3,8 -shuffle8 5,7,6,7 -shuffle8 9,11,5,11 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 -level4: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpbroadcastd 120(%rdx),%ymm9 +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) -butterfly 10,3,6,5,4,8,7,11,9,9 +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa 
%ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx) -cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x256q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 256(%rsi),%ymm5 -vmovdqa 512(%rsi),%ymm6 -vmovdqa 768(%rsi),%ymm7 -vmovdqa 1024(%rsi),%ymm8 -vmovdqa 1280(%rsi),%ymm9 -vmovdqa 1536(%rsi),%ymm10 -vmovdqa 1792(%rsi),%ymm11 - -level5: -vpbroadcastd (%rdx),%ymm3 -vpbroadcastd 4(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 - -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 - -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 - -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpbroadcastd 8(%rdx),%ymm5 -vpbroadcastd 12(%rdx),%ymm7 - -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 - -vpaddd %ymm10,%ymm11,%ymm10 - -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 - -level6: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpbroadcastd 16(%rdx),%ymm15 -vpbroadcastd 20(%rdx),%ymm3 - -butterfly 4,5,8,9,6,7,10,11 - -level7: -#cdecl(PQCLEAN_DILITHIUM3_AVX2_zetas) -vpbroadcastd 24(%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11,3,3 - -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xdiv)(%rip),%ymm3 - -vpmuludq %ymm3,%ymm4,%ymm4 -vpmuludq %ymm3,%ymm5,%ymm5 -vpmuludq %ymm3,%ymm6,%ymm6 
-vpmuludq %ymm3,%ymm7,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm12,%ymm4,%ymm4 -vpaddq %ymm13,%ymm5,%ymm5 -vpaddq %ymm14,%ymm6,%ymm6 -vpaddq %ymm15,%ymm7,%ymm7 -vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm6,%ymm6 -vpsrlq $32,%ymm7,%ymm7 - -#store -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_mask)(%rip),%ymm3 -vpermd %ymm4,%ymm3,%ymm4 -vpermd %ymm5,%ymm3,%ymm5 -vpermd %ymm6,%ymm3,%ymm6 -vpermd %ymm7,%ymm3,%ymm7 -vpermd %ymm8,%ymm3,%ymm8 -vpermd %ymm9,%ymm3,%ymm9 -vpermd %ymm10,%ymm3,%ymm10 -vpermd %ymm11,%ymm3,%ymm11 -vmovdqa %xmm4,(%rdi) -vmovdqa %xmm5,128(%rdi) -vmovdqa %xmm6,256(%rdi) -vmovdqa %xmm7,384(%rdi) -vmovdqa %xmm8,512(%rdi) -vmovdqa %xmm9,640(%rdi) -vmovdqa %xmm10,768(%rdi) -vmovdqa %xmm11,896(%rdi) +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd $0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 
1 +levels6t7 2 +levels6t7 3 ret diff --git a/crypto_sign/dilithium3/avx2/ntt.S b/crypto_sign/dilithium3/avx2/ntt.S index 4cb18d8b..d89f8045 100644 --- a/crypto_sign/dilithium3/avx2/ntt.S +++ b/crypto_sign/dilithium3/avx2/ntt.S @@ -1,179 +1,199 @@ +#include "cdecl.h" .include "shuffle.inc" -#include "cdecl.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 -#mul -vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 -vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 -vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 -vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 -#reduce -vpmuludq %ymm0,%ymm\rh0,%ymm12 -vpmuludq %ymm0,%ymm\rh1,%ymm13 -vpmuludq %ymm0,%ymm\rh2,%ymm14 -vpmuludq %ymm0,%ymm\rh3,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm\rh0,%ymm12,%ymm12 -vpaddq %ymm\rh1,%ymm13,%ymm13 -vpaddq %ymm\rh2,%ymm14,%ymm14 -vpaddq %ymm\rh3,%ymm15,%ymm15 -vpsrlq $32,%ymm12,%ymm12 -vpsrlq $32,%ymm13,%ymm13 -vpsrlq $32,%ymm14,%ymm14 -vpsrlq $32,%ymm15,%ymm15 +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 -#update -vpaddd %ymm2,%ymm\rl0,%ymm\rh0 -vpaddd %ymm2,%ymm\rl1,%ymm\rh1 -vpaddd %ymm2,%ymm\rl2,%ymm\rh2 -vpaddd %ymm2,%ymm\rl3,%ymm\rh3 -vpaddd %ymm12,%ymm\rl0,%ymm\rl0 -vpaddd %ymm13,%ymm\rl1,%ymm\rl1 -vpaddd %ymm14,%ymm\rl2,%ymm\rl2 -vpaddd %ymm15,%ymm\rl3,%ymm\rl3 -vpsubd %ymm12,%ymm\rh0,%ymm\rh0 -vpsubd %ymm13,%ymm\rh1,%ymm\rh1 -vpsubd %ymm14,%ymm\rh2,%ymm\rh2 -vpsubd %ymm15,%ymm\rh3,%ymm\rh3 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l .endm -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx) 
-cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 -level0: -#zetas -vpbroadcastd (%rdx),%ymm3 +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 -#load -vpmovzxdq (%rsi),%ymm4 -vpmovzxdq 128(%rsi),%ymm5 -vpmovzxdq 256(%rsi),%ymm6 -vpmovzxdq 384(%rsi),%ymm7 -vpmovzxdq 512(%rsi),%ymm8 -vpmovzxdq 640(%rsi),%ymm9 -vpmovzxdq 768(%rsi),%ymm10 -vpmovzxdq 896(%rsi),%ymm11 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 -butterfly 4,5,6,7,8,9,10,11 +/* level 1 */ +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 -level1: -#PQCLEAN_DILITHIUM3_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 -butterfly 4,5,8,9,6,7,10,11,12,12,13,13 +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm -level2: -#PQCLEAN_DILITHIUM3_AVX2_zetas -vpbroadcastd 12(%rdx),%ymm12 -vpbroadcastd 16(%rdx),%ymm13 -vpbroadcastd 20(%rdx),%ymm14 -vpbroadcastd 24(%rdx),%ymm15 +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 
+vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 -butterfly 4,6,8,10,5,7,9,11,12,13,14,15 +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,256(%rdi) -vmovdqa %ymm6,512(%rdi) -vmovdqa %ymm7,768(%rdi) -vmovdqa %ymm8,1024(%rdi) -vmovdqa %ymm9,1280(%rdi) -vmovdqa %ymm10,1536(%rdi) -vmovdqa %ymm11,1792(%rdi) - -ret - -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx) -cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x2q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 32(%rsi),%ymm5 -vmovdqa 64(%rsi),%ymm6 -vmovdqa 96(%rsi),%ymm7 -vmovdqa 128(%rsi),%ymm8 -vmovdqa 160(%rsi),%ymm9 -vmovdqa 192(%rsi),%ymm10 -vmovdqa 224(%rsi),%ymm11 - -level3: -#zetas -vpbroadcastd (%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11 - -level4: -#PQCLEAN_DILITHIUM3_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 -vpblendd $0xF0,%ymm13,%ymm12,%ymm12 +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly 3,8,4,9,5,10,6,11,12,12,12,12 +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 -level5: -#zetas -vpmovzxdq 12(%rdx),%ymm12 +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 shuffle4 3,5,7,5 shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly 7,5,3,10,8,6,4,11,12,12,12,12 +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 -level6: -#zetas -vpmovzxdq 28(%rdx),%ymm12 -vpmovzxdq 44(%rdx),%ymm13 +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 -butterfly 7,5,8,6,3,10,4,11,12,12,13,13 +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 
+shuffle2 10,11,3,11 -level7: -#zetas -vpmovzxdq 60(%rdx),%ymm12 -vpmovzxdq 76(%rdx),%ymm13 -vpmovzxdq 92(%rdx),%ymm14 -vpmovzxdq 108(%rdx),%ymm15 +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 -butterfly 7,3,8,4,5,10,6,11,12,13,14,15 +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 -#store -vpsllq $32,%ymm5,%ymm5 -vpsllq $32,%ymm10,%ymm10 -vpsllq $32,%ymm6,%ymm6 -vpsllq $32,%ymm11,%ymm11 -vpblendd $0xAA,%ymm5,%ymm7,%ymm7 -vpblendd $0xAA,%ymm10,%ymm3,%ymm3 -vpblendd $0xAA,%ymm6,%ymm8,%ymm8 -vpblendd $0xAA,%ymm11,%ymm4,%ymm4 +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 -shuffle4 7,3,5,3 -shuffle4 8,4,7,4 +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 4,11,1,10,2,15 -shuffle8 5,7,6,7 -shuffle8 3,4,5,4 +/* level 7 */ +vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 -vmovdqa %ymm6,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm7,64(%rdi) -vmovdqa %ymm4,96(%rdi) +vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa 
%ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_ntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 ret + diff --git a/crypto_sign/dilithium3/avx2/ntt.h b/crypto_sign/dilithium3/avx2/ntt.h index cf0a0a58..b6c6e372 100644 --- a/crypto_sign/dilithium3/avx2/ntt.h +++ b/crypto_sign/dilithium3/avx2/ntt.h @@ -1,36 +1,14 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_DILITHIUM3_AVX2_NTT_H +#define PQCLEAN_DILITHIUM3_AVX2_NTT_H -#include +#include -#include "nttconsts.h" -#include "params.h" +void PQCLEAN_DILITHIUM3_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); +void PQCLEAN_DILITHIUM3_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); -void PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas -); -void PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas -); +void PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx(__m256i *a); -void PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv -); -void PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv -); - -void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); +void PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i 
*PQCLEAN_DILITHIUM3_AVX2_qdata); +void PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3_AVX2_qdata); #endif diff --git a/crypto_sign/dilithium3/avx2/nttconsts.c b/crypto_sign/dilithium3/avx2/nttconsts.c deleted file mode 100644 index 2bd3b20b..00000000 --- a/crypto_sign/dilithium3/avx2/nttconsts.c +++ /dev/null @@ -1,80 +0,0 @@ -#include "nttconsts.h" - -#define QINV 4236238847 // -q^(-1) mod 2^32 -#define MONT 4193792ULL -#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) - - -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; - -#undef QINV -#undef MONT -#undef DIV - - -const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas = { - .as_arr = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, - 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, - 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191, - 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, - 7122806, 4296819, 5190273, 4747489, 
1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, - 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, - 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, - 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, - 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, - 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, - 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, - 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, - 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, - 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, - 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, - 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, - 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, - 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, - 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 7404533, 1723600, 7276084, 3866901, 1717735, - 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, - 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, - 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, - 4834730, 7018208, 1976782 - } -}; - -const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv = { - .as_arr = { - 6403635, 1362209, 3545687, 2286327, 846154, 
48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, - 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, - 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, - 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, - 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, - 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, - 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, - 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, - 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, - 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, - 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, - 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, - 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, - 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, - 5538076, 4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, - 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, - 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, - 1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, - 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 
6136326, - 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, - 3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, - 518909, 2608894, 3975713 - } -}; diff --git a/crypto_sign/dilithium3/avx2/nttconsts.h b/crypto_sign/dilithium3/avx2/nttconsts.h deleted file mode 100644 index caf6945d..00000000 --- a/crypto_sign/dilithium3/avx2/nttconsts.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H -#define PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H - -#include -#include - -#include "alignment.h" -#include "params.h" - -typedef ALIGNED_UINT32(8) aligned_uint32x8_t; - -typedef ALIGNED_UINT32(N) aligned_uint32xN_t; - - -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xqinv; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xq; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x2q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x256q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_mask; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8x23ones; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM3_AVX2_8xdiv; - -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas; -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM3_AVX2_zetas_inv; - -#endif //PQCLEAN_DILITHIUM3_AVX2_NTTCONSTS_H - diff --git a/crypto_sign/dilithium3/avx2/packing.c b/crypto_sign/dilithium3/avx2/packing.c index 24b2bfff..dde16b0f 100644 --- a/crypto_sign/dilithium3/avx2/packing.c +++ b/crypto_sign/dilithium3/avx2/packing.c @@ -3,6 +3,7 @@ #include "poly.h" #include "polyvec.h" + /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_pack_pk * @@ -12,17 +13,18 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_pack_pk( - uint8_t 
pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { pk[i] = rho[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); } } @@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM3_AVX2_pack_pk( * - const polyveck *t1: pointer to output vector t1 * - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = pk[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_pack_sk * -* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
* * Arguments: - uint8_t sk[]: output byte array * - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key * - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = rho[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = key[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { sk[i] = tr[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - 
PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sk * -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). * * Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key * - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 * - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { key[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { tr[i] = sk[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s1->vec[i], sk + i * 
POLYETA_PACKEDBYTES); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_pack_sig * -* Description: Bit-pack signature sig = (z, h, c). +* Description: Bit-pack signature sig = (c, z, h). * * Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM3_AVX2_challenge hash length SEEDBYTES * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; +void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - 
if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { + for (i = 0; i < OMEGA + K; ++i) { sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; } } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); + + sig[OMEGA + i] = (uint8_t) k; } } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_unpack_sig * -* Description: Unpack signature sig = (z, h, c). +* Description: Unpack signature sig = (c, z, h). * -* Arguments: - polyvecl *z: pointer to output vector z +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM3_AVX2_challenge hash +* - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial * - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. 
**************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; +int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; /* Decode h */ k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { h->vec[i].coeffs[j] = 0; } @@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( return 1; } - for (size_t j = k; j < sig[OMEGA + i]; ++j) { + for (j = k; j < sig[OMEGA + i]; ++j) { /* Coefficients are ordered for strong unforgeability */ if (j > k && sig[j] <= sig[j - 1]) { return 1; @@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( } /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { + for (j = k; j < OMEGA; ++j) { if (sig[j]) { return 1; } } - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - return 0; } diff --git 
a/crypto_sign/dilithium3/avx2/packing.h b/crypto_sign/dilithium3/avx2/packing.h index afee9223..9404c00f 100644 --- a/crypto_sign/dilithium3/avx2/packing.h +++ b/crypto_sign/dilithium3/avx2/packing.h @@ -1,42 +1,31 @@ #ifndef PQCLEAN_DILITHIUM3_AVX2_PACKING_H #define PQCLEAN_DILITHIUM3_AVX2_PACKING_H - -#include "api.h" #include "params.h" #include "polyvec.h" +#include -void PQCLEAN_DILITHIUM3_AVX2_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1); -void PQCLEAN_DILITHIUM3_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM3_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM3_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM3_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM3_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM3_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM3_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM3_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + 
+void PQCLEAN_DILITHIUM3_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM3_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM3_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium3/avx2/params.h b/crypto_sign/dilithium3/avx2/params.h index 6096eaf1..d824116f 100644 --- a/crypto_sign/dilithium3/avx2/params.h +++ b/crypto_sign/dilithium3/avx2/params.h @@ -2,28 +2,40 @@ #define PQCLEAN_DILITHIUM3_AVX2_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) +#define D 13 +#define ROOT_OF_UNITY 1753 -#define K 5 -#define L 4 -#define ETA 5 -#define SETABITS 4 -#define BETA 275 -#define OMEGA 96 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 55 +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_ALGNAME "Dilithium3" -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 128 + +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + 
K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium3/avx2/pointwise.S b/crypto_sign/dilithium3/avx2/pointwise.S index b0085373..b3e020f4 100644 --- a/crypto_sign/dilithium3/avx2/pointwise.S +++ b/crypto_sign/dilithium3/avx2/pointwise.S @@ -1,11 +1,14 @@ #include "params.h" -#include "cdecl.inc" +#include "cdecl.h" +.text .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_avx): #consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 xor %eax,%eax _looptop1: @@ -18,41 +21,41 @@ vmovdqa 32(%rdx),%ymm12 vmovdqa 64(%rdx),%ymm14 vpsrlq $32,%ymm2,%ymm3 vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 +vmovshdup %ymm6,%ymm7 vpsrlq $32,%ymm10,%ymm11 vpsrlq $32,%ymm12,%ymm13 -vpsrlq $32,%ymm14,%ymm15 +vmovshdup %ymm14,%ymm15 #mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 -vpmuludq %ymm6,%ymm14,%ymm6 -vpmuludq %ymm7,%ymm15,%ymm7 +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 #reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq %ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 -vpaddq %ymm6,%ymm14,%ymm6 -vpaddq %ymm7,%ymm15,%ymm7 +vpmuldq 
%ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 vpsrlq $32,%ymm2,%ymm2 vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm6,%ymm6 +vmovshdup %ymm6,%ymm6 #store vpblendd $0xAA,%ymm3,%ymm2,%ymm2 @@ -67,7 +70,7 @@ add $96,%rsi add $96,%rdx add $1,%eax cmp $10,%eax -jb _looptop1 +jb _looptop1 vmovdqa (%rsi),%ymm2 vmovdqa 32(%rsi),%ymm4 @@ -75,30 +78,30 @@ vmovdqa (%rdx),%ymm10 vmovdqa 32(%rdx),%ymm12 vpsrlq $32,%ymm2,%ymm3 vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 #mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 #reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq %ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm4,%ymm4 #store vpblendd 
$0x55,%ymm2,%ymm3,%ymm2 @@ -116,14 +119,14 @@ vmovdqa \off(%rdx),%ymm10 vmovdqa \off+32(%rdx),%ymm12 vpsrlq $32,%ymm6,%ymm7 vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 #mul -vpmuludq %ymm6,%ymm10,%ymm6 -vpmuludq %ymm7,%ymm11,%ymm7 -vpmuludq %ymm8,%ymm12,%ymm8 -vpmuludq %ymm9,%ymm13,%ymm9 +vpmuldq %ymm6,%ymm10,%ymm6 +vpmuldq %ymm7,%ymm11,%ymm7 +vpmuldq %ymm8,%ymm12,%ymm8 +vpmuldq %ymm9,%ymm13,%ymm9 .endm .macro acc @@ -134,10 +137,12 @@ vpaddq %ymm9,%ymm5,%ymm5 .endm .global cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx): #consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm1 +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 xor %eax,%eax _looptop2: @@ -158,22 +163,26 @@ acc pointwise 3072 acc +pointwise 4096 +acc + + #reduce -vpmuludq %ymm0,%ymm2,%ymm6 -vpmuludq %ymm0,%ymm3,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm8 -vpmuludq %ymm0,%ymm5,%ymm9 -vpmuludq %ymm1,%ymm6,%ymm6 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm8,%ymm8 -vpmuludq %ymm1,%ymm9,%ymm9 -vpaddq %ymm2,%ymm6,%ymm2 -vpaddq %ymm3,%ymm7,%ymm3 -vpaddq %ymm4,%ymm8,%ymm4 -vpaddq %ymm5,%ymm9,%ymm5 +vpmuldq %ymm0,%ymm2,%ymm6 +vpmuldq %ymm0,%ymm3,%ymm7 +vpmuldq %ymm0,%ymm4,%ymm8 +vpmuldq %ymm0,%ymm5,%ymm9 +vpmuldq %ymm1,%ymm6,%ymm6 +vpmuldq %ymm1,%ymm7,%ymm7 +vpmuldq %ymm1,%ymm8,%ymm8 +vpmuldq %ymm1,%ymm9,%ymm9 +vpsubq %ymm6,%ymm2,%ymm2 +vpsubq %ymm7,%ymm3,%ymm3 +vpsubq %ymm8,%ymm4,%ymm4 +vpsubq %ymm9,%ymm5,%ymm5 vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm4,%ymm4 #store vpblendd $0xAA,%ymm3,%ymm2,%ymm2 diff --git a/crypto_sign/dilithium3/avx2/poly.c b/crypto_sign/dilithium3/avx2/poly.c index f6876f81..550ed556 100644 --- a/crypto_sign/dilithium3/avx2/poly.c +++ 
b/crypto_sign/dilithium3/avx2/poly.c @@ -1,52 +1,94 @@ -#include -#include - +#include "align.h" +#include "consts.h" #include "fips202x4.h" #include "ntt.h" -#include "nttconsts.h" #include "params.h" #include "poly.h" -#include "reduce.h" #include "rejsample.h" #include "rounding.h" #include "symmetric.h" +#include +#include +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_reduce * -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a) { - PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs); + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_poly_csubq +* Name: poly_addq * -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a) { - PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs); +void PQCLEAN_DILITHIUM3_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_freeze * -* Description: Reduce all coefficients of the polynomial to standard -* representatives. +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives. Assumes input +* coefficients to be at most 2^31 - 2^22 + 1 in +* absolute value. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a) { - PQCLEAN_DILITHIUM3_AVX2_reduce_avx(a->coeffs); - PQCLEAN_DILITHIUM3_AVX2_csubq_avx(a->coeffs); + DBENCH_START(); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(a); + PQCLEAN_DILITHIUM3_AVX2_poly_caddq(a); + + DBENCH_STOP(*tred); } /************************************************* @@ -59,20 +101,24 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a) { * - const poly *b: pointer to second summand **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_sub * -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is +* Description: Subtract polynomials. No modular reduction is * performed. 
* * Arguments: - poly *c: pointer to output polynomial @@ -81,227 +127,239 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b) { * subtraced from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM3_AVX2_8x2q.as_vec); + unsigned int i; + __m256i f, g; + DBENCH_START(); - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, twoq); - vec0 = _mm256_sub_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_shiftl * * Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. +* input coefficients to be less than 2^{31-D} in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a) { - __m256i vec; + unsigned int i; + __m256i f; + DBENCH_START(); - for (size_t i = 0; i < N / 8; i++) { - vec = _mm256_load_si256(&a->coeffs_x8[i]); - vec = _mm256_slli_epi32(vec, D); - _mm256_store_si256(&a->coeffs_x8[i], vec); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); } + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_ntt * -* Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. 
+* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a) { - ALIGNED_UINT64(N) tmp; + DBENCH_START(); - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM3_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 1); - } - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM3_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM3_AVX2_zetas.as_arr + 8 + 31 * i); - } + PQCLEAN_DILITHIUM3_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM3_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: poly_invntt_montgomery +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont * -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a) { - ALIGNED_UINT64(N) tmp; +void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM3_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 31 * i); - } - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM3_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM3_AVX2_zetas_inv.as_arr + 248); - } + PQCLEAN_DILITHIUM3_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM3_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery * * Description: Pointwise multiplication of polynomials in NTT domain * representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. +* by 2^{-32}. 
* * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); +void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM3_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_power2round * * Description: For all coefficients c of the input polynomial, -* compute c0, c1 such that c mod Q = c1*2^D + c0 +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 * with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. +* positive standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); - } +void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_decompose * * Description: For all coefficients c of the input polynomial, -* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 -* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we * set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. -* Assumes coefficients to be standard representatives. +* Assumes coefficients to be positive standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_decompose( - poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); - } +void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_make_hint * -* Description: Compute hint polynomial. The coefficients of which indicate -* whether the low bits of the corresponding coefficient of -* the input polynomial overflow into the high bits. +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. * -* Arguments: - poly *h: pointer to output hint polynomial +* Arguments: - uint8_t *h: pointer to output hint array (preallocated of length N) * - const poly *a0: pointer to low part of input polynomial * - const poly *a1: pointer to high part of input polynomial * -* Returns number of 1 bits. +* Returns number of hints, i.e. length of hint array. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_AVX2_poly_make_hint( - poly *restrict h, - const poly *restrict a0, - const poly *restrict a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { - h->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]); - s += h->coeffs[i]; - } - return s; +unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(hint, a0->vec, a1->vec); + + DBENCH_STOP(*tround); + return r; } /************************************************* - * Name: PQCLEAN_DILITHIUM3_AVX2_poly_use_hint - * - * Description: Use hint polynomial to correct the high bits of a polynomial. +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_use_hint * -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial * - const poly *h: pointer to input hint polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint( - poly *restrict a, - const poly *restrict b, - const poly *restrict h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM3_AVX2_use_hint(b->coeffs[i], h->coeffs[i]); - } +void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(b->vec, a->vec, h->vec); + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_poly_chknorm * * Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. 
+* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM3_AVX2_poly_reduce(). * * Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm is strictly smaller than B and 1 otherwise. +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B) { - int32_t t; +int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + DBENCH_START(); - /* It is ok to leak which coefficient violates the bound since - the probability for each coefficient is independent of secret - data but we must not leak the sign of the centralized representative. */ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (Q - 1) / 2 - a->coeffs[i]; - t ^= (t >> 31); - t = (Q - 1) / 2 - t; - - if ((uint32_t)t >= B) { - return 1; - } + if (B > (Q - 1) / 8) { + return 1; } - return 0; + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + r = 1 - _mm256_testz_si256(t, t); + DBENCH_STOP(*tsample); + return r; } /************************************************* -* Name: rej_uniform_ref +* Name: rej_uniform * * Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_uniform_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos + 3 <= buflen) { @@ -315,101 +373,87 @@ static size_t rej_uniform_ref( } } + DBENCH_STOP(*tsample); return ctr; } /************************************************* -* Name: poly_uniform +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_uniform * * Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t nblocks = POLY_UNIFORM_NBLOCKS; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; - stream128_state state; +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, nblocks, &state); - - ctr = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a->coeffs, N, buf, buflen); + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); while (ctr < N) { - off = buflen % 3; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM128_BLOCKBYTES + off; - stream128_squeezeblocks(buf + off, 1, &state); - ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen); + /* length of buf is always divisible by 3; hence, no bytes left */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); } - stream128_ctx_release(&state); +} + +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); 
} void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t seed[SEEDBYTES], + const uint8_t seed[32], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE128_RATE]; - __m256i state[25]; + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf[4]; + keccakx4_state state; + __m256i f; - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); + buf[0].coeffs[SEEDBYTES + 0] = nonce0; + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; - ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE); - ctr3 = 
PQCLEAN_DILITHIUM3_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_NBLOCKS, &state); + + ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(a3->coeffs, buf[3].coeffs); while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE128_RATE); - ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE128_RATE); - ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE128_RATE); - ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE128_RATE); + ctr0 += rej_uniform(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); } } @@ -417,433 +461,454 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0, * Name: rej_eta * * Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_eta_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t0, t1; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos < buflen) { t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; + if (t0 < 9) { + a[ctr++] = 4 - t0; } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; } } + DBENCH_STOP(*tsample); return ctr; } /************************************************* -* Name: poly_uniform_eta +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta * * Description: Sample polynomial with uniformly random coefficients * in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* output stream of SHAKE256(seed|nonce) +* or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta( - poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_ETA_BUFLEN) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { stream128_state state; - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); - - while (ctr < N) { - stream128_squeezeblocks(buf, 1, &state); - ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); - } - stream128_ctx_release(&state); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); } -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x( - poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t seed[SEEDBYTES], - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; -
uint8_t outbuf[4][2 * SHAKE128_RATE]; - __m256i state[25]; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; - - PQCLEAN_DILITHIUM3_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2, - state); - - ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE); - ctr3 = PQCLEAN_DILITHIUM3_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE); - - while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM3_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); - - ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE); - ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE); - ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE); - ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE); - } -} - -/************************************************* -* Name: rej_gamma1m1_ref -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. 
-* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_gamma1m1_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1 -* -* Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES -* - uint16_t nonce: 16-bit nonce -**************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - stream256_state state; - - stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN); - - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream256_ctx_release(&state); -} - -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0, +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t seed[CRHBYTES], + const uint8_t seed[32], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][CRHBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE256_RATE]; - __m256i state[25]; + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_ETA_BUFLEN) buf[4]; - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; - inbuf[0][CRHBYTES + 1] = nonce0 >> 8; - inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF; - inbuf[1][CRHBYTES + 1] = nonce1 >> 8; - 
inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF; - inbuf[2][CRHBYTES + 1] = nonce2 >> 8; - inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF; - inbuf[3][CRHBYTES + 1] = nonce3 >> 8; + __m256i f; + keccakx4_state state; - PQCLEAN_DILITHIUM3_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - CRHBYTES + 2); - PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); - ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE); - ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE); - ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE); - ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE); + buf[0].coeffs[SEEDBYTES + 0] = nonce0; + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM3_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_ETA_NBLOCKS, &state); + + ctr0 = PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(a3->coeffs, buf[3].coeffs); while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM3_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); + 
PQCLEAN_DILITHIUM3_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); - ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE256_RATE); - ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE256_RATE); - ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE256_RATE); - ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE256_RATE); + ctr0 += rej_eta(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_eta(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_eta(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_eta(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); } } +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM3_AVX2_polyz_unpack reads 14 additional bytes */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(a, buf.coeffs); +} + +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[48], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf[4]; + keccakx4_state state; + __m256i f; + __m128i g; + + f = _mm256_loadu_si256((__m256i *)seed); + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + g = _mm_loadu_si128((__m128i *)&seed[32]); + _mm_store_si128((__m128i *)&buf[0].vec[1], g); + _mm_store_si128((__m128i *)&buf[1].vec[1], g); + _mm_store_si128((__m128i *)&buf[2].vec[1], g); + _mm_store_si128((__m128i *)&buf[3].vec[1], g); + + buf[0].coeffs[CRHBYTES + 0] = nonce0; + buf[0].coeffs[CRHBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[CRHBYTES + 0] = nonce1; + buf[1].coeffs[CRHBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[CRHBYTES + 0] = nonce2; + buf[2].coeffs[CRHBYTES + 1] = nonce2 >> 8; + 
buf[3].coeffs[CRHBYTES + 0] = nonce3; + buf[3].coeffs[CRHBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM3_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, CRHBYTES + 2); + PQCLEAN_DILITHIUM3_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(a0, buf[0].coeffs); + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(a1, buf[1].coeffs); + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(a2, buf[2].coeffs); + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(a3, buf[3].coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3_AVX2_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). +* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); + pos = 8; + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyeta_pack * * Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
-* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. * * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes +* POLYETA_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t *restrict r, const poly *restrict a) { - uint8_t t[N / 2]; - for (size_t i = 0; i < N / 2; ++i) { - t[0] = Q + ETA - a->coeffs[2 * i + 0]; - t[1] = Q + ETA - a->coeffs[2 * i + 1]; +void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = ETA - a->coeffs[2 * i + 0]; + t[1] = ETA - a->coeffs[2 * i + 1]; r[i] = t[0] | (t[1] << 4); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack * * Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyt1_pack * -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes +* POLYT1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t *restrict r, const poly *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); +void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack * -* Description: Unpack polynomial t1 with 9-bit coefficients. -* Output coefficients are standard representatives. +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] >> 0) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; +void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyt0_pack * * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes +* POLYT0_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[4]; +void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; - r[7 * i + 0] = t[0]; - r[7 * i + 1] = t[0] >> 8; - r[7 * i + 1] |= t[1] << 6; - r[7 * i + 2] = t[1] >> 2; - r[7 * i + 3] = t[1] >> 10; - r[7 * i + 3] |= t[2] << 4; - r[7 * i + 4] = t[2] >> 4; - r[7 * i + 5] = t[2] >> 12; - r[7 * i + 5] |= t[3] << 2; - r[7 * i + 6] = t[3] >> 6; + r[13 * i + 0] = t[0]; + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] |= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * 
i + 12] = t[7] >> 5; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack * * Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; +void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 
* i + 3]; + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyz_pack * -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes +* POLYZ_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[2]; +void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; r[5 * i + 0] = t[0]; r[5 * i + 1] = t[0] >> 8; @@ -852,50 +917,82 @@ void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t *restrict r, const poly *restric r[5 * i + 3] = t[1] >> 4; r[5 * i + 4] = t[1] >> 12; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyz_unpack * * Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; +void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 12]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, + -1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); + const __m256i mask = _mm256_set1_epi32(0xFFFFF); + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; - - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + for (i = 0; i < N / 8; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); + _mm256_store_si256(&r->vec[i], f); } + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyw1_pack * -* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. 
+* Input coefficients are assumed to be positive standard representatives. * * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes +* POLYW1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack( - uint8_t *restrict r, - const poly *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); +void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); + const __m256i shufbidx = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + DBENCH_START(); + + for (i = 0; i < N / 64; ++i) { + f0 = _mm256_load_si256(&a->vec[8 * i + 0]); + f1 = _mm256_load_si256(&a->vec[8 * i + 1]); + f2 = _mm256_load_si256(&a->vec[8 * i + 2]); + f3 = _mm256_load_si256(&a->vec[8 * i + 3]); + f4 = _mm256_load_si256(&a->vec[8 * i + 4]); + f5 = _mm256_load_si256(&a->vec[8 * i + 5]); + f6 = _mm256_load_si256(&a->vec[8 * i + 6]); + f7 = _mm256_load_si256(&a->vec[8 * i + 7]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); } + + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/avx2/poly.h b/crypto_sign/dilithium3/avx2/poly.h index a944b600..b424c625 100644 --- a/crypto_sign/dilithium3/avx2/poly.h +++ b/crypto_sign/dilithium3/avx2/poly.h @@ -1,19 +1,14 @@ 
-#ifndef POLY_H -#define POLY_H - -#include +#ifndef PQCLEAN_DILITHIUM3_AVX2_POLY_H +#define PQCLEAN_DILITHIUM3_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" #include -#include "alignment.h" -#include "params.h" - -typedef union { - uint32_t coeffs[N]; - __m256i coeffs_x8[N / 8]; -} poly; +typedef ALIGNED_INT32(N) poly; void PQCLEAN_DILITHIUM3_AVX2_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM3_AVX2_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_caddq(poly *a); void PQCLEAN_DILITHIUM3_AVX2_poly_freeze(poly *a); void PQCLEAN_DILITHIUM3_AVX2_poly_add(poly *c, const poly *a, const poly *b); @@ -21,63 +16,64 @@ void PQCLEAN_DILITHIUM3_AVX2_poly_sub(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(poly *a); void PQCLEAN_DILITHIUM3_AVX2_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM3_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); void PQCLEAN_DILITHIUM3_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); -unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); +unsigned int PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void 
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM3_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); -int PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform(poly *a, - const uint8_t *seed, - uint16_t nonce); void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(poly *a, - const uint8_t *seed, - uint16_t nonce); void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1(poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(poly *a0, +void PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(poly *a0, poly *a1, poly *a2, poly *a3, - const uint8_t *seed, + const uint8_t seed[CRHBYTES], uint16_t nonce0, uint16_t nonce1, uint16_t nonce2, uint16_t nonce3); -void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); -void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const 
uint8_t *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); -void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); -void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t *a); +void PQCLEAN_DILITHIUM3_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); + +void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); -void PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(uint8_t *r, const poly *a); #endif diff --git a/crypto_sign/dilithium3/avx2/polyvec.c b/crypto_sign/dilithium3/avx2/polyvec.c index 583c102c..10ccd9f8 100644 --- a/crypto_sign/dilithium3/avx2/polyvec.c +++ b/crypto_sign/dilithium3/avx2/polyvec.c @@ -1,14 +1,127 @@ -#include - +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "polyvec.h" +#include + +#define UNUSED(x) (void)x + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + polyvecl tmp; + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(&mat[0], &mat[1], rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(&mat[1], &mat[2], rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(&mat[2], &mat[3], rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(&mat[3], NULL, rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(&mat[4], &mat[5], rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(&mat[5], &tmp, rho); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 0, 1, 2, 3); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2], rho, 4, 256, 257, 258); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[1]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[2]); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1], rho, 259, 260, 512, 513); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]); + 
PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[1]); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[2], &rowa->vec[3], &rowa->vec[4], &rowb->vec[0], rho, 514, 515, 516, 768); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + UNUSED(rowb); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[1], &rowa->vec[2], &rowa->vec[3], &rowa->vec[4], rho, 769, 770, 771, 772); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 1024, 1025, 1026, 1027); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[4], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2], rho, 1028, 1280, 1281, 1282); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[1]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowb->vec[2]); +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t 
rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowb->vec[0], &rowb->vec[1], rho, 1283, 1284, 1536, 1537); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM3_AVX2_poly_nttunpack(&rowa->vec[4]); +} + + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[i]); + } +} + /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze * @@ -18,7 +131,9 @@ * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]); } } @@ -34,7 +149,9 @@ void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v) { * - const polyvecl *v: pointer to second summand **************************************************/ 
void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -48,44 +165,60 @@ void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]); } } +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery * * Description: Pointwise multiply vectors of polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials * in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. Output -* coeffcient are less than 2*L*Q. 
* * Arguments: - poly *w: output polynomial * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, - const polyvecl *u, - const polyvecl *v) { - PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM3_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM3_AVX2_qdata.vec); } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(). * * Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { - for (size_t i = 0; i < L; ++i) { +int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) { return 1; } @@ -98,37 +231,48 @@ int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) /************ Vectors of polynomials of length K **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. +* to representatives in [-6283009,6283007]. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq +* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq * * Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. +* add Q if coefficient is negative. 
* * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_csubq(&v->vec[i]); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&v->vec[i]); } } /************************************************* -* Name: polyveck_freeze +* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze * * Description: Reduce coefficients of polynomials in vector of length K * to standard representatives. @@ -136,7 +280,9 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_freeze(&v->vec[i]); } } @@ -152,7 +298,9 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v) { * - const polyveck *v: pointer to second summand **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -161,8 +309,7 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_sub * * Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. +* No modular reduction is performed. 
* * Arguments: - polyveck *w: pointer to output vector * - const polyveck *u: pointer to first input vector @@ -170,7 +317,9 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const * subtracted from first input vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -179,12 +328,14 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. +* reduction. Assumes input coefficients to be less than 2^{31-D}. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&v->vec[i]); } } @@ -198,13 +349,15 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. 
Input coefficients need to be less @@ -212,9 +365,19 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v) { * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&v->vec[i]); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); } } @@ -222,16 +385,18 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v) { * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. -* Assumes input coefficients to be standard representatives. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(). * * Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { - for (size_t i = 0; i < K; ++i) { +int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&v->vec[i], bound)) { return 1; } @@ -244,18 +409,20 @@ int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 * with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be * standard representatives. * * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -264,7 +431,7 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co * Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 +* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 * with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we * set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. * Assumes coefficients to be standard representatives. 
@@ -272,12 +439,13 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, co * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -287,37 +455,44 @@ void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose( * * Description: Compute hint vector. * -* Arguments: - polyveck *h: pointer to output vector +* Arguments: - uint8_t *hint: pointer to output hint array * - const polyveck *v0: pointer to low part of input vector * - const polyveck *v1: pointer to high part of input vector * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; +unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); } - return s; + return n; } /************************************************* -* Name: polyveck_use_hint +* Name: PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint * * Description: Use hint vector to correct the high bits of input vector. 
* * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *v: pointer to input vector +* - const polyveck *u: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); } } diff --git a/crypto_sign/dilithium3/avx2/polyvec.h b/crypto_sign/dilithium3/avx2/polyvec.h index 297407a1..983e13f0 100644 --- a/crypto_sign/dilithium3/avx2/polyvec.h +++ b/crypto_sign/dilithium3/avx2/polyvec.h @@ -1,58 +1,72 @@ #ifndef PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H #define PQCLEAN_DILITHIUM3_AVX2_POLYVEC_H - -#include - #include "params.h" #include "poly.h" +#include /* Vectors of polynomials of length L */ typedef struct { poly vec[L]; } polyvecl; +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_reduce(polyvecl *v); + void PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(polyvecl *v); void PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); void PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(polyvecl *v); -void 
PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); - -int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); - +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); +int PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); /* Vectors of polynomials of length K */ typedef struct { poly vec[K]; } polyveck; +void PQCLEAN_DILITHIUM3_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + void PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(polyveck *v); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(polyveck *v); void PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(polyveck *v); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); void PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(polyveck *v); void PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); -int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm( - const polyveck *v, uint32_t B); +int PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round( - polyveck *v1, polyveck *v0, const 
polyveck *v); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const 
polyvecl mat[K], const polyvecl *v); #endif diff --git a/crypto_sign/dilithium3/avx2/reduce.S b/crypto_sign/dilithium3/avx2/reduce.S deleted file mode 100644 index cef9a7a4..00000000 --- a/crypto_sign/dilithium3/avx2/reduce.S +++ /dev/null @@ -1,93 +0,0 @@ -#include "cdecl.inc" - -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx) -cdecl(PQCLEAN_DILITHIUM3_AVX2_reduce_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8x23ones)(%rip),%ymm0 - -xor %eax,%eax -_looptop_rdc32: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#reduce -vpsrld $23,%ymm1,%ymm2 -vpsrld $23,%ymm3,%ymm4 -vpsrld $23,%ymm5,%ymm6 -vpsrld $23,%ymm7,%ymm8 -vpand %ymm0,%ymm1,%ymm1 -vpand %ymm0,%ymm3,%ymm3 -vpand %ymm0,%ymm5,%ymm5 -vpand %ymm0,%ymm7,%ymm7 -vpsubd %ymm2,%ymm1,%ymm1 -vpsubd %ymm4,%ymm3,%ymm3 -vpsubd %ymm6,%ymm5,%ymm5 -vpsubd %ymm8,%ymm7,%ymm7 -vpslld $13,%ymm2,%ymm2 -vpslld $13,%ymm4,%ymm4 -vpslld $13,%ymm6,%ymm6 -vpslld $13,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd %ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd %ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_rdc32 - -ret - -.global cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx) -cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM3_AVX2_8xq)(%rip),%ymm0 - -xor %eax,%eax -_looptop_csubq: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#cdecl(PQCLEAN_DILITHIUM3_AVX2_csubq) -vpsubd %ymm0,%ymm1,%ymm1 -vpsubd %ymm0,%ymm3,%ymm3 -vpsubd %ymm0,%ymm5,%ymm5 -vpsubd %ymm0,%ymm7,%ymm7 -vpsrad $31,%ymm1,%ymm2 -vpsrad $31,%ymm3,%ymm4 -vpsrad $31,%ymm5,%ymm6 -vpsrad $31,%ymm7,%ymm8 -vpand %ymm0,%ymm2,%ymm2 -vpand %ymm0,%ymm4,%ymm4 -vpand %ymm0,%ymm6,%ymm6 -vpand %ymm0,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd %ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd 
%ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_csubq - -ret diff --git a/crypto_sign/dilithium3/avx2/reduce.h b/crypto_sign/dilithium3/avx2/reduce.h deleted file mode 100644 index 2488cbfd..00000000 --- a/crypto_sign/dilithium3/avx2/reduce.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef REDUCE_H -#define REDUCE_H - -#include - -void PQCLEAN_DILITHIUM3_AVX2_reduce_avx(uint32_t a[N]); -void PQCLEAN_DILITHIUM3_AVX2_csubq_avx(uint32_t a[N]); - -#endif diff --git a/crypto_sign/dilithium3/avx2/rejsample.c b/crypto_sign/dilithium3/avx2/rejsample.c index deff658f..a70674c2 100644 --- a/crypto_sign/dilithium3/avx2/rejsample.c +++ b/crypto_sign/dilithium3/avx2/rejsample.c @@ -1,9 +1,10 @@ -#include - #include "params.h" #include "rejsample.h" +#include "symmetric.h" +#include +#include -static const uint8_t idx[256][8] = { +const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8] = { { 0, 0, 0, 0, 0, 0, 0, 0}, { 0, 0, 0, 0, 0, 0, 0, 0}, { 1, 0, 0, 0, 0, 0, 0, 0}, @@ -262,178 +263,128 @@ static const uint8_t idx[256][8] = { { 0, 1, 2, 3, 4, 5, 6, 7} }; -uint32_t PQCLEAN_DILITHIUM3_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos, vec[8]; - __m256i d, tmp; +unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { + unsigned int ctr, pos; uint32_t good; + __m256i d, tmp; const __m256i bound = _mm256_set1_epi32(Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, + -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, + -1, 5, 4, 3, -1, 2, 1, 0); ctr = pos = 0; - while (ctr + 8 <= len && pos + 24 <= buflen) { - for (size_t i = 0; i < 8; i++) { - vec[i] = buf[pos++]; - vec[i] |= (uint32_t)buf[pos++] << 8; - vec[i] |= (uint32_t)buf[pos++] << 16; - vec[i] &= 0x7FFFFF; - 
} + while (pos <= REJ_UNIFORM_BUFLEN - 24) { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); + d = _mm256_permute4x64_epi64(d, 0x94); + d = _mm256_shuffle_epi8(d, idx8); + d = _mm256_and_si256(d, mask); + pos += 24; - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); + tmp = _mm256_sub_epi32(d, bound); good = _mm256_movemask_ps((__m256)tmp); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); + tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good])); d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); + + _mm256_storeu_si256((__m256i *)&r[ctr], d); + ctr += _mm_popcnt_u32(good); + + if (ctr > N - 8) { + break; + } } - while (ctr < len && pos + 3 <= buflen) { - vec[0] = buf[pos++]; - vec[0] |= (uint32_t)buf[pos++] << 8; - vec[0] |= (uint32_t)buf[pos++] << 16; - vec[0] &= 0x7FFFFF; + uint32_t t; + while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; - if (vec[0] < Q) { - r[ctr++] = vec[0]; + if (t < Q) { + r[ctr++] = t; } } return ctr; } -uint32_t PQCLEAN_DILITHIUM3_AVX2_rej_eta( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos; - uint8_t vec[32]; - __m256i tmp0, tmp1; - __m128i d0, d1, rid; +unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { + unsigned int ctr, pos; uint32_t good; - const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); - const __m256i off = _mm256_set1_epi32(Q + ETA); + __m256i f0, f1; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(4); + const __m256i bound = _mm256_set1_epi8(9); ctr = pos = 0; - while (ctr + 32 <= len && pos + 16 <= buflen) { - for (size_t i = 0; i < 16; i++) { - vec[2 * i + 0] = buf[pos] & 0x0F; 
- vec[2 * i + 1] = buf[pos++] >> 4; + while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = _mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - tmp0 = _mm256_loadu_si256((__m256i_u *)vec); - tmp1 = _mm256_cmpgt_epi8(bound, tmp0); - good = _mm256_movemask_epi8(tmp1); + if (ctr > N - 8) { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - d0 = _mm256_castsi256_si128(tmp0); - rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount(good & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - 
ctr += __builtin_popcount((good >> 8) & 0xFF); - - d0 = _mm256_extracti128_si256(tmp0, 1); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 16) & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 24) & 0xFF); + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3_AVX2_idxlut[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good); + pos += 4; } - while (ctr < len && pos < buflen) { - vec[0] = buf[pos] & 0x0F; - vec[1] = buf[pos++] >> 4; + uint32_t t0, t1; + while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; - if (vec[0] <= 2 * ETA) { - r[ctr++] = Q + ETA - vec[0]; + if (t0 < 9) { + r[ctr++] = 4 - t0; } - if (vec[1] <= 2 * ETA && ctr < len) { - r[ctr++] = Q + ETA - vec[1]; - } - } - - return ctr; -} - -uint32_t PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos; - uint32_t vec[8]; - __m256i d, tmp; - uint32_t good; - const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); - const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); - - ctr = pos = 0; - while (ctr + 8 <= len && pos + 20 <= buflen) { - for (size_t i = 0; i < 4; i++) { - vec[2 * i + 0] = buf[pos + 0]; - vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; - vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; - vec[2 * i + 0] &= 0xFFFFF; - - vec[2 * i + 1] = buf[pos + 2] >> 4; - vec[2 * 
i + 1] |= (uint32_t)buf[pos + 3] << 4; - vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - } - - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); - good = _mm256_movemask_ps((__m256)tmp); - d = _mm256_sub_epi32(off, d); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); - d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); - } - - while (ctr < len && pos + 5 <= buflen) { - vec[0] = buf[pos + 0]; - vec[0] |= (uint32_t)buf[pos + 1] << 8; - vec[0] |= (uint32_t)buf[pos + 2] << 16; - vec[0] &= 0xFFFFF; - - vec[1] = buf[pos + 2] >> 4; - vec[1] |= (uint32_t)buf[pos + 3] << 4; - vec[1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (vec[0] <= 2 * GAMMA1 - 2) { - r[ctr++] = Q + GAMMA1 - 1 - vec[0]; - } - if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { - r[ctr++] = Q + GAMMA1 - 1 - vec[1]; + if (t1 < 9 && ctr < N) { + r[ctr++] = 4 - t1; } } diff --git a/crypto_sign/dilithium3/avx2/rejsample.h b/crypto_sign/dilithium3/avx2/rejsample.h index 799f3753..9012ae97 100644 --- a/crypto_sign/dilithium3/avx2/rejsample.h +++ b/crypto_sign/dilithium3/avx2/rejsample.h @@ -1,25 +1,19 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H - -#include +#ifndef PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H +#define PQCLEAN_DILITHIUM3_AVX2_REJSAMPLE_H +#include "params.h" +#include "symmetric.h" #include -uint32_t PQCLEAN_DILITHIUM3_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) -uint32_t PQCLEAN_DILITHIUM3_AVX2_rej_eta( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +#define REJ_UNIFORM_ETA_NBLOCKS ((228+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -uint32_t 
PQCLEAN_DILITHIUM3_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); +extern const uint8_t PQCLEAN_DILITHIUM3_AVX2_idxlut[256][8]; + +unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); + +unsigned int PQCLEAN_DILITHIUM3_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN]); #endif diff --git a/crypto_sign/dilithium3/avx2/rounding.c b/crypto_sign/dilithium3/avx2/rounding.c index 920f0f70..1fbe15f5 100644 --- a/crypto_sign/dilithium3/avx2/rounding.c +++ b/crypto_sign/dilithium3/avx2/rounding.c @@ -1,115 +1,154 @@ +#include "consts.h" +#include "params.h" +#include "rejsample.h" #include "rounding.h" +#include +#include +#include + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) /************************************************* * Name: power2round * -* Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. -* Assumes a to be standard representative. +* Description: For finite field elements a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be positive standard representative. * -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - __m256i *a1: output array of length N/8 with high bits +* - __m256i *a0: output array of length N/8 with low bits a0 +* - const __m256i *a: input array of length N/8 * -* Returns a1. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0) { - int32_t t; +void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i mask = _mm256_set1_epi32(-(1 << D)); + const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += (t >> 31) & (1U << D); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; + for (i = 0; i < N / 8; ++i) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, half); + f0 = _mm256_and_si256(f1, mask); + f1 = _mm256_srli_epi32(f1, D); + f0 = _mm256_sub_epi32(f, f0); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_decompose +* Name: decompose * -* Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* Description: For finite field element a, compute high and low parts a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except * if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard * representative. * -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - __m256i *a1: output array of length N/8 with high parts +* - __m256i *a0: output array of length N/8 with low parts a0 +* - const __m256i *a: input array of length N/8 * -* Returns a1. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; +void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3_AVX2_qdata.vec[_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (a >> 19) << 9; - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= t; + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} - /* Divide by ALPHA (possible to avoid) */ - u = a - 1; - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - /* Border case */ - *a0 = Q + t - (a >> 4); - a &= 0xF; - return a; +/************************************************* +* Name: make_hint +* +* Description: Compute indices of polynomial coefficients whose low bits +* overflow into the high bits. 
+* +* Arguments: - uint8_t *hint: hint array +* - const __m256i *a0: low bits of input elements +* - const __m256i *a1: high bits of input elements +* +* Returns number of overflowing low bits +**************************************************/ +unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { + unsigned int i, n = 0; + __m256i f0, f1, g0, g1; + uint32_t bad; + uint64_t idx; + const __m256i low = _mm256_set1_epi32(-GAMMA2); + const __m256i high = _mm256_set1_epi32(GAMMA2); + + for (i = 0; i < N / 8; ++i) { + f0 = _mm256_load_si256(&a0[i]); + f1 = _mm256_load_si256(&a1[i]); + g0 = _mm256_abs_epi32(f0); + g0 = _mm256_cmpgt_epi32(g0, high); + g1 = _mm256_cmpeq_epi32(f0, low); + g1 = _mm256_sign_epi32(g1, f1); + g0 = _mm256_or_si256(g0, g1); + + bad = _mm256_movemask_ps((__m256)g0); + memcpy(&idx, PQCLEAN_DILITHIUM3_AVX2_idxlut[bad], 8); + idx += (uint64_t)0x0808080808080808 * i; + memcpy(&hint[n], &idx, 8); + n += _mm_popcnt_u32(bad); + } + + return n; } /************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_make_hint +* Name: use_hint * -* Description: Compute hint bit indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. +* Description: Correct high parts according to hint. * -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element +* Arguments: - __m256i *b: output array of length N/8 with corrected high parts +* - const __m256i *a: input array of length N/8 +* - const __m256i *a: input array of length N/8 with hint bits * -* Returns 1 if high bits of a and b differ and 0 otherwise. 
**************************************************/ -unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; - } +void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { + unsigned int i; + __m256i a0[N / 8]; + __m256i f, g, h, t; + const __m256i zero = _mm256_setzero_si256(); + const __m256i mask = _mm256_set1_epi32(15); - return 1; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_use_hint -* -* Description: Correct high bits according to hint. -* -* Arguments: - uint32_t a: input element -* - unsigned int hint: hint bit -* -* Returns corrected high bits. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(const uint32_t a, const unsigned int hint) { - uint32_t a0, a1; - - a1 = PQCLEAN_DILITHIUM3_AVX2_decompose(a, &a0); - if (hint == 0) { - return a1; - } - if (a0 > Q) { - return (a1 + 1) & 0xF; - } - return (a1 - 1) & 0xF; - - /* If decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ + PQCLEAN_DILITHIUM3_AVX2_decompose_avx(b, a0, a); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a0[i]); + g = _mm256_load_si256(&b[i]); + h = _mm256_load_si256(&hint[i]); + t = _mm256_blendv_epi32(zero, h, f); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + g = _mm256_add_epi32(g, h); + g = _mm256_and_si256(g, mask); + _mm256_store_si256(&b[i], g); + } } diff --git a/crypto_sign/dilithium3/avx2/rounding.h b/crypto_sign/dilithium3/avx2/rounding.h index 048e8aaa..69a15900 100644 --- a/crypto_sign/dilithium3/avx2/rounding.h +++ b/crypto_sign/dilithium3/avx2/rounding.h @@ -1,12 +1,12 @@ -#ifndef ROUNDING_H -#define ROUNDING_H - +#ifndef PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H +#define 
PQCLEAN_DILITHIUM3_AVX2_ROUNDING_H #include "params.h" +#include #include -uint32_t PQCLEAN_DILITHIUM3_AVX2_power2round(uint32_t a, uint32_t *a0); -uint32_t PQCLEAN_DILITHIUM3_AVX2_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM3_AVX2_use_hint(uint32_t a, unsigned int hint); +void PQCLEAN_DILITHIUM3_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); +void PQCLEAN_DILITHIUM3_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); +unsigned int PQCLEAN_DILITHIUM3_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); +void PQCLEAN_DILITHIUM3_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); #endif diff --git a/crypto_sign/dilithium3/avx2/shuffle.S b/crypto_sign/dilithium3/avx2/shuffle.S new file mode 100644 index 00000000..e81f2486 --- /dev/null +++ b/crypto_sign/dilithium3/avx2/shuffle.S @@ -0,0 +1,54 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM3_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add 
$256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium3/avx2/shuffle.inc b/crypto_sign/dilithium3/avx2/shuffle.inc index df352030..73e9ffe0 100644 --- a/crypto_sign/dilithium3/avx2/shuffle.inc +++ b/crypto_sign/dilithium3/avx2/shuffle.inc @@ -9,15 +9,17 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 .endm diff --git a/crypto_sign/dilithium3/avx2/sign.c b/crypto_sign/dilithium3/avx2/sign.c index b605203a..f746e8f8 100644 --- a/crypto_sign/dilithium3/avx2/sign.c +++ b/crypto_sign/dilithium3/avx2/sign.c @@ -1,6 +1,4 @@ -#include -#include - +#include "align.h" #include "fips202.h" #include "packing.h" #include "params.h" @@ -9,105 +7,36 @@ #include "randombytes.h" #include "sign.h" #include "symmetric.h" +#include +#include -/************************************************* -* Name: expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). 
-* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ - -void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[5], const uint8_t rho[SEEDBYTES]) { - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[0].vec[0], - &mat[0].vec[1], - &mat[0].vec[2], - &mat[0].vec[3], - rho, 0, 1, 2, 3); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[1].vec[0], - &mat[1].vec[1], - &mat[1].vec[2], - &mat[1].vec[3], - rho, 256, 257, 258, 259); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[2].vec[0], - &mat[2].vec[1], - &mat[2].vec[2], - &mat[2].vec[3], - rho, 512, 513, 514, 515); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[3].vec[0], - &mat[3].vec[1], - &mat[3].vec[2], - &mat[3].vec[3], - rho, 768, 769, 770, 771); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_4x(&mat[4].vec[0], - &mat[4].vec[1], - &mat[4].vec[2], - &mat[4].vec[3], - rho, 1024, 1025, 1026, 1027); -} - - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_AVX2_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint8_t b; - size_t pos; - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; +static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) { + switch (i) { + case 0: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row0(buf, buf + 1, rho); + *row = buf; + break; + case 1: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row1(buf + 1, buf, rho); + *row = buf + 1; + break; + case 2: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row2(buf, buf + 1, rho); + *row = buf; + break; + case 3: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row3(buf + 1, buf, rho); + *row = buf + 1; + break; + case 4: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row4(buf, buf + 1, rho); + *row = buf; + break; + case 5: + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand_row5(buf + 1, buf, rho); + *row = buf + 1; + break; } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t) outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); - 
signs >>= 1; - } - shake256_ctx_release(&state); } /************************************************* @@ -116,57 +45,70 @@ void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, * Description: Generates public and private key. * * Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES bytes) * - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM3_AVX2_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; uint8_t seedbuf[3 * SEEDBYTES]; - uint8_t tr[CRHBYTES]; const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; - polyvecl mat[K]; - polyvecl s1, s1hat; - polyveck s2, t, t1, t0; + polyvecl rowbuf[2]; + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); rho = seedbuf; rhoprime = seedbuf + SEEDBYTES; key = seedbuf + 2 * SEEDBYTES; - /* Expand matrix */ - PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); /* Sample short vectors s1 and s2 */ - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, - nonce, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[0], &s2.vec[1], &s2.vec[2], &s2.vec[3], rhoprime, - nonce + 4, nonce + 5, nonce + 6, nonce + 7); - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta(&s2.vec[4], rhoprime, nonce + 8); + 
PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s2.vec[0], &s2.vec[1], &s2.vec[2], rhoprime, 4, 5, 6, 7); + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_eta_4x(&s2.vec[3], &s2.vec[4], &s2.vec[5], &t0, rhoprime, 8, 9, 10, 11); - /* Matrix-vector multiplication */ - s1hat = s1; - PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - //PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&t.vec[i]); + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM3_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); } - /* Add error vector s2 */ - PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&t, &t, &s2); + /* Transform s1 */ + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1); - /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&t); - PQCLEAN_DILITHIUM3_AVX2_polyveck_power2round(&t1, &t0, &t); - PQCLEAN_DILITHIUM3_AVX2_pack_pk(pk, rho, &t1); - /* Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM3_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + for (i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, rho, i); + + /* Compute inner-product */ + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM3_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM3_AVX2_poly_power2round(&t1, 
&t0, &t1); + PQCLEAN_DILITHIUM3_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM3_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); return 0; } @@ -174,42 +116,40 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature * -* Description: Compute signed message. +* Description: Computes signature. * -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES -* of len) -* - size_t *siglen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - uint32_t n; +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * 
POLYZ_PACKEDBYTES; + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; + shake256incctx state; rho = seedbuf; tr = rho + SEEDBYTES; key = tr + CRHBYTES; mu = key + SEEDBYTES; rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + PQCLEAN_DILITHIUM3_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); - - // use incremental hash API instead of copying around buffers - /* Compute CRH(tr, m) */ - shake256incctx state; + /* Compute CRH(tr, msg) */ shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -220,76 +160,89 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature( crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&s1); PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&s2); PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t0); + rej: /* Sample intermediate vector y */ - PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &y.vec[3], + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); - nonce += 4; + PQCLEAN_DILITHIUM3_AVX2_poly_uniform_gamma1(&z.vec[4], rhoprime, nonce + 4); + nonce += 5; - /* Matrix-vector multiplication */ - yhat = y; - PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&w.vec[i]); - } + /* Matrix-vector product */ + tmpv.y = z; + PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM3_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + 
PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_tomont(&w1); /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w); - PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM3_AVX2_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat); + PQCLEAN_DILITHIUM3_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); + PQCLEAN_DILITHIUM3_AVX2_polyveck_pack_w1(sig, &w1); - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&w0, &w0, &cs2); - PQCLEAN_DILITHIUM3_AVX2_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c); /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&z.vec[i]); - } - PQCLEAN_DILITHIUM3_AVX2_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM3_AVX2_polyvecl_freeze(&z); - if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - goto rej; + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&z.vec[i]); + if 
(PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { + goto rej; + } } - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM3_AVX2_poly_invntt_montgomery(&ct0.vec[i]); + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&tmp); + if (PQCLEAN_DILITHIUM3_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM3_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM3_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; } - PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM3_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { - goto rej; + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); } - PQCLEAN_DILITHIUM3_AVX2_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&w0); - n = PQCLEAN_DILITHIUM3_AVX2_polyveck_make_hint(&h, &w0, &w1); - if (n > OMEGA) { - goto rej; - } - - /* Write signature */ - PQCLEAN_DILITHIUM3_AVX2_pack_sig(sig, 
&z, &h, &c); *siglen = PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; return 0; } @@ -303,63 +256,55 @@ rej: * array with PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen bytes), * can be equal to m * - size_t *smlen: pointer to output length of signed -* message +* message * - const uint8_t *m: pointer to message to be signed * - size_t mlen: length of message * - const uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, m, mlen); - rc = PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, mlen, sk); *smlen += mlen; - return rc; + return 0; } /************************************************* * Name: PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify * -* Description: Verify signed message. +* Description: Verifies signature. 
* -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key * -* Returns 0 if signed message could be verified correctly and -1 otherwise +* Returns 0 if signature could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, - const uint8_t *pk) { - uint8_t rho[SEEDBYTES]; +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { + unsigned int i, j, pos = 0; + /* PQCLEAN_DILITHIUM3_AVX2_polyw1_pack writes additional 14 bytes */ + ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; uint8_t mu[CRHBYTES]; - poly c, chat, cp; - polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; + const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + polyvecl rowbuf[2]; + polyvecl *row = rowbuf; + polyvecl z; + poly c, w1, h; + shake256incctx state; - if (siglen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { - return -1; - } - - PQCLEAN_DILITHIUM3_AVX2_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM3_AVX2_unpack_sig(&z, &h, &c, sig)) { - return -1; - } - if (PQCLEAN_DILITHIUM3_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + if (siglen != PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { return -1; } /* Compute CRH(CRH(rho, t1), msg) */ crh(mu, pk, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -367,33
+312,69 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); - /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM3_AVX2_expand_mat(mat, rho); - PQCLEAN_DILITHIUM3_AVX2_polyvecl_ntt(&z); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); + /* Expand challenge */ + PQCLEAN_DILITHIUM3_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&c); + + /* Unpack z; shortness follows from unpacking */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&z.vec[i]); } - chat = c; - PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&chat); - PQCLEAN_DILITHIUM3_AVX2_polyveck_shiftl(&t1); - PQCLEAN_DILITHIUM3_AVX2_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, pk, i); + + /* Compute i-th row of Az - c2^Dt1 */ + PQCLEAN_DILITHIUM3_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); + + PQCLEAN_DILITHIUM3_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); + PQCLEAN_DILITHIUM3_AVX2_poly_shiftl(&h); + PQCLEAN_DILITHIUM3_AVX2_poly_ntt(&h); + PQCLEAN_DILITHIUM3_AVX2_poly_pointwise_montgomery(&h, &c, &h); + + PQCLEAN_DILITHIUM3_AVX2_poly_sub(&w1, &w1, &h); + PQCLEAN_DILITHIUM3_AVX2_poly_reduce(&w1); + PQCLEAN_DILITHIUM3_AVX2_poly_invntt_tomont(&w1); + + /* Get hint polynomial and reconstruct w1 */ + memset(h.vec, 0, sizeof(poly)); + if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { + return -1; + } + + for (j = pos; j < hint[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > pos && hint[j] <= hint[j - 1]) { + return -1; + } + h.coeffs[hint[j]] =
1; + } + pos = hint[OMEGA + i]; + + PQCLEAN_DILITHIUM3_AVX2_poly_caddq(&w1); + PQCLEAN_DILITHIUM3_AVX2_poly_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM3_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); } - PQCLEAN_DILITHIUM3_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); - PQCLEAN_DILITHIUM3_AVX2_polyveck_reduce(&tmp1); - PQCLEAN_DILITHIUM3_AVX2_polyveck_invntt_montgomery(&tmp1); + /* Extra indices are zero for strong unforgeability */ + for (j = pos; j < OMEGA; ++j) { + if (hint[j]) { + return -1; + } + } - /* Reconstruct w1 */ - PQCLEAN_DILITHIUM3_AVX2_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM3_AVX2_polyveck_use_hint(&w1, &tmp1, &h); - - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM3_AVX2_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { + /* Call random oracle and verify challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (buf.coeffs[i] != sig[i]) { return -1; } } @@ -407,7 +388,7 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( * Description: Verify signed message.
* * Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm +* array with smlen bytes), can be equal to sm * - size_t *mlen: pointer to output length of message * - const uint8_t *sm: pointer to signed message * - size_t smlen: length of signed message @@ -415,30 +396,28 @@ int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify( * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { + for (i = 0; i < *mlen; ++i) { m[i] = sm[PQCLEAN_DILITHIUM3_AVX2_CRYPTO_BYTES + i]; } return 0; } - /* Signature verification failed */ badsig: - *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { + /* Signature verification failed */ + *mlen = -1; + for (i = 0; i < smlen; ++i) { m[i] = 0; } diff --git a/crypto_sign/dilithium3/avx2/sign.h b/crypto_sign/dilithium3/avx2/sign.h index 15112b4d..d42631b3 100644 --- a/crypto_sign/dilithium3/avx2/sign.h +++ b/crypto_sign/dilithium3/avx2/sign.h @@ -1,15 +1,29 @@ -#ifndef SIGN_H -#define SIGN_H - -#include "api.h" +#ifndef PQCLEAN_DILITHIUM3_AVX2_SIGN_H +#define 
PQCLEAN_DILITHIUM3_AVX2_SIGN_H #include "params.h" #include "poly.h" #include "polyvec.h" +#include +#include -void PQCLEAN_DILITHIUM3_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); +void PQCLEAN_DILITHIUM3_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM3_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); #endif - diff --git a/crypto_sign/dilithium3/avx2/stream.c b/crypto_sign/dilithium3/avx2/stream.c deleted file mode 100644 index eaa4326b..00000000 --- a/crypto_sign/dilithium3/avx2/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium3/avx2/stream.h b/crypto_sign/dilithium3/avx2/stream.h deleted file mode 100644 
index 93ce3d06..00000000 --- a/crypto_sign/dilithium3/avx2/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM3_AVX2_STREAM_H -#define PQCLEAN_DILITHIUM3_AVX2_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium3/avx2/symmetric-shake.c b/crypto_sign/dilithium3/avx2/symmetric-shake.c new file mode 100644 index 00000000..1baa0e8f --- /dev/null +++ b/crypto_sign/dilithium3/avx2/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium3/avx2/symmetric.h b/crypto_sign/dilithium3/avx2/symmetric.h index a54a9ab4..3d97a2b3 100644 --- a/crypto_sign/dilithium3/avx2/symmetric.h +++ b/crypto_sign/dilithium3/avx2/symmetric.h @@ -1,25 +1,36 @@ #ifndef PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H #define PQCLEAN_DILITHIUM3_AVX2_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - #include "fips202.h" +#include "params.h" +#include -#define crh(OUT, IN, INBYTES) shake256(OUT, 
CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_AVX2_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) #endif diff --git a/crypto_sign/dilithium3/clean/LICENSE b/crypto_sign/dilithium3/clean/LICENSE index 40541676..08473af7 100644 --- a/crypto_sign/dilithium3/clean/LICENSE +++ 
b/crypto_sign/dilithium3/clean/LICENSE @@ -1,6 +1,5 @@ Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. diff --git a/crypto_sign/dilithium3/clean/Makefile b/crypto_sign/dilithium3/clean/Makefile index cd309004..9c094e09 100644 --- a/crypto_sign/dilithium3/clean/Makefile +++ b/crypto_sign/dilithium3/clean/Makefile @@ -1,13 +1,10 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libdilithium3_clean.a +HEADERS=api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o -HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h stream.h - -CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) diff --git a/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake index f41af919..79d38690 100644 --- a/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake +++ b/crypto_sign/dilithium3/clean/Makefile.Microsoft_nmake @@ -2,8 +2,13 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libdilithium3_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj -CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX +OBJECTS=ntt.obj 
packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 all: $(LIBRARY) @@ -11,7 +16,7 @@ all: $(LIBRARY) $(OBJECTS): *.h $(LIBRARY): $(OBJECTS) - LIB.EXE /NOLOGO /WX /OUT:$@ $** + LIB.EXE /NOLOGO /WX /OUT:$@ $** clean: -DEL $(OBJECTS) diff --git a/crypto_sign/dilithium3/clean/api.h b/crypto_sign/dilithium3/clean/api.h index 77809909..1799052b 100644 --- a/crypto_sign/dilithium3/clean/api.h +++ b/crypto_sign/dilithium3/clean/api.h @@ -4,26 +4,14 @@ #include #include - -#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1472U -#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 3504U -#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 2701U +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES 1952 +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES 4016 +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES 3293 #define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3" -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( uint8_t *sig, size_t *siglen, @@ -33,6 +21,12 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, 
size_t mlen, const uint8_t *sk); +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); #endif diff --git a/crypto_sign/dilithium3/clean/ntt.c b/crypto_sign/dilithium3/clean/ntt.c index 9856d734..75c4ecbf 100644 --- a/crypto_sign/dilithium3/clean/ntt.c +++ b/crypto_sign/dilithium3/clean/ntt.c @@ -1,138 +1,98 @@ +#include "ntt.h" +#include "params.h" +#include "reduce.h" #include -#include "params.h" -#include "ntt.h" -#include "poly.h" -#include "reduce.h" - -/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM3_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM3_CLEAN_zetas[N] = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, - 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, - 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, - 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, 2706023, - 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, 4519302, - 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, 6681150, - 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, 811944, - 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, 4450022, - 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, 7122806, - 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, 3412210, - 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, 7709315, - 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, 5037034, - 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, 4656075, 8371839, - 1653064, 5130689, 2389356, 8169440, 759969, 7063561, 189548, 4827145, - 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, 1285669, 6795489, - 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, 2091667, 3407706, - 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, 266997, 2434439, - 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, 900702, 
1859098, - 909542, 819034, 495491, 6767243, 8337157, 7857917, 7725090, 5257975, - 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, 342297, 286988, - 5942594, 4108315, 3437287, 5038140, 1735879, 203044, 2842341, 2691481, - 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, 4613401, 1250494, - 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, 7047359, 1237275, - 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, 7100756, 1917081, - 5834105, 7005614, 1500165, 777191, 2235880, 3406031, 7838005, 5548557, - 6709241, 6533464, 5796124, 4656147, 594136, 4603424, 6366809, 2432395, - 2454455, 8215696, 1957272, 3369112, 185531, 7173032, 5196991, 162844, - 1616392, 3014001, 810149, 1652634, 4686184, 6581310, 5341501, 3523897, - 3866901, 269760, 2213111, 7404533, 1717735, 472078, 7953734, 1723600, - 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, 5441381, 6144432, - 7959518, 6094090, 183443, 7403526, 1612842, 4834730, 7826001, 3919660, - 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 -}; - -/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM3_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[N] = { - 6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, - 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, - 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, - 7908339, 6662682, 975884, 6167306, 8110657, 4513516, 4856520, 3038916, - 1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426, - 1207385, 8194886, 5011305, 6423145, 164721, 5925962, 5948022, 2013608, - 3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412, - 4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661, - 1962642, 5074302, 7067962, 451100, 1430225, 3318210, 7143142, 1333058, - 1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016, - 6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076, - 8177373, 6644538, 3342277, 
4943130, 4272102, 2437823, 8093429, 8038120, - 3595838, 768622, 525098, 3556995, 5173371, 6348669, 3122442, 655327, - 522500, 43260, 1613174, 7884926, 7561383, 7470875, 6521319, 7479715, - 3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420, - 3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750, - 4540456, 3628969, 3881060, 3019102, 1439742, 812732, 1584928, 7094748, - 7039087, 7064828, 177440, 2409325, 1851402, 5220671, 3553272, 8190869, - 1316856, 7620448, 210977, 5991061, 3249728, 6727353, 8578, 3724342, - 4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383, - 1430430, 6527646, 7031341, 381987, 1308169, 22981, 1228525, 671102, - 2477047, 411027, 3693493, 2967645, 5665122, 6232521, 983419, 4968207, - 8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611, - 1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395, - 2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473, - 4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267, - 539299, 6031717, 300467, 4840449, 2867647, 4805995, 3043716, 3861115, - 4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394, - 8100412, 4369920, 19422, 6623180, 3277672, 1399561, 3859737, 2118186, - 2108549, 5760665, 1119584, 549488, 4794489, 1079900, 7356305, 5654953, - 5700314, 5268920, 2884855, 5260684, 2091905, 359251, 6026966, 6554070, - 7913949, 876248, 777960, 8143293, 518909, 2608894, 8354570 +static const int32_t zetas[N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, + 2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, + -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, 
-3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, + -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, + 3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, + -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, + -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, + -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, + 1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, + 2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, + 266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, + 900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, + -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, + 342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, + -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, + -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, + -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, + -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, + -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, + -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, + -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, + -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, + -554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 }; /************************************************* 
* Name: PQCLEAN_DILITHIUM3_CLEAN_ntt * * Description: Forward NTT, in-place. No modular reduction is performed after -* additions or subtractions. Hence output coefficients can be up -* to 16*Q larger than the coefficients of the input polynomial. -* Output vector is in bitreversed order. +* additions or subtractions. Output vector is in bitreversed order. * * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]) { - size_t k, j; - uint32_t zeta, t; +void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]) { + unsigned int len, start, j, k; + int32_t zeta, t; - k = 1; - for (size_t len = 128; len > 0; len >>= 1) { - for (size_t start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas[k++]; + k = 0; + for (len = 128; len > 0; len >>= 1) { + for (start = 0; start < N; start = j + len) { + zeta = zetas[++k]; for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); - p[j + len] = p[j] + 2 * Q - t; - p[j] = p[j] + t; + t = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + a[j + len] = a[j] - t; + a[j] = a[j] + t; } } } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont +* Name: PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont * * Description: Inverse NTT and multiplication by Montgomery factor 2^32. * In-place. No modular reductions after additions or -* subtractions. Input coefficient need to be smaller than 2*Q. -* Output coefficient are smaller than 2*Q. +* subtractions; input coefficients need to be smaller than +* Q in absolute value. Output coefficients are smaller than Q in +* absolute value.
* * Arguments: - uint32_t p[N]: input/output coefficient array **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]) { - size_t start, len, j, k; - uint32_t t, zeta; - const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; +void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]) { + unsigned int start, len, j, k; + int32_t t, zeta; + const int32_t f = 41978; // mont^2/256 - k = 0; + k = 256; for (len = 1; len < N; len <<= 1) { for (start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM3_CLEAN_zetas_inv[k++]; + zeta = -zetas[--k]; for (j = start; j < start + len; ++j) { - t = p[j]; - p[j] = t + p[j + len]; - p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); + t = a[j]; + a[j] = t + a[j + len]; + a[j + len] = t - a[j + len]; + a[j + len] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); } } } for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t) f * p[j]); + a[j] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)f * a[j]); } } diff --git a/crypto_sign/dilithium3/clean/ntt.h b/crypto_sign/dilithium3/clean/ntt.h index e0176177..ac07568d 100644 --- a/crypto_sign/dilithium3/clean/ntt.h +++ b/crypto_sign/dilithium3/clean/ntt.h @@ -1,11 +1,10 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_NTT_H #define PQCLEAN_DILITHIUM3_CLEAN_NTT_H - +#include "params.h" #include -#include "params.h" +void PQCLEAN_DILITHIUM3_CLEAN_ntt(int32_t a[N]); -void PQCLEAN_DILITHIUM3_CLEAN_ntt(uint32_t p[N]); -void PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(uint32_t p[N]); +void PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(int32_t a[N]); #endif diff --git a/crypto_sign/dilithium3/clean/packing.c b/crypto_sign/dilithium3/clean/packing.c index ebaee136..ed75c70d 100644 --- a/crypto_sign/dilithium3/clean/packing.c +++ b/crypto_sign/dilithium3/clean/packing.c @@ -3,6 +3,7 
@@ #include "poly.h" #include "polyvec.h" + /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_pack_pk * @@ -12,17 +13,18 @@ * - const uint8_t rho[]: byte array containing rho * - const polyveck *t1: pointer to vector t1 **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { pk[i] = rho[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); } } @@ -35,212 +37,201 @@ void PQCLEAN_DILITHIUM3_CLEAN_pack_pk( * - const polyveck *t1: pointer to output vector t1 * - uint8_t pk[]: byte array containing bit-packed pk **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = pk[i]; } pk += SEEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sk * -* Description: Bit-pack 
secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). * * Arguments: - uint8_t sk[]: output byte array * - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key * - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 * - const polyvecl *s1: pointer to vector s1 * - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = rho[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { sk[i] = key[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { sk[i] = tr[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, 
&s2->vec[i]); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sk * -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). * * Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key * - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 * - const polyvecl *s1: pointer to output vector s1 * - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 * - uint8_t sk[]: byte array containing bit-packed sk **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { rho[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < SEEDBYTES; ++i) { + for (i = 0; i < SEEDBYTES; ++i) { key[i] = sk[i]; } sk += SEEDBYTES; - for (size_t i = 0; i < CRHBYTES; ++i) { + for (i = 0; i < CRHBYTES; ++i) { tr[i] = sk[i]; } sk += CRHBYTES; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * 
POLETA_SIZE_PACKED); + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += L * POLETA_SIZE_PACKED; + sk += L * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); } - sk += K * POLETA_SIZE_PACKED; + sk += K * POLYETA_PACKEDBYTES; - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); } } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_pack_sig * -* Description: Bit-pack signature sig = (z, h, c). +* Description: Bit-pack signature sig = (c, z, h). * * Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES * - const polyvecl *z: pointer to vector z * - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; +void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + 
sig += L * POLYZ_PACKEDBYTES; /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { + for (i = 0; i < OMEGA + K; ++i) { sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; } } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); + + sig[OMEGA + i] = (uint8_t) k; } } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_unpack_sig * -* Description: Unpack signature sig = (z, h, c). +* Description: Unpack signature sig = (c, z, h). * -* Arguments: - polyvecl *z: pointer to output vector z +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM3_CLEAN_challenge hash +* - polyvecl *z: pointer to output vector z * - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial * - const uint8_t sig[]: byte array containing * bit-packed signature * * Returns 1 in case of malformed signature; otherwise 0. 
**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; +int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]) { + unsigned int i, j, k; - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; } - sig += L * POLZ_SIZE_PACKED; + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; /* Decode h */ k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { h->vec[i].coeffs[j] = 0; } @@ -248,7 +239,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( return 1; } - for (size_t j = k; j < sig[OMEGA + i]; ++j) { + for (j = k; j < sig[OMEGA + i]; ++j) { /* Coefficients are ordered for strong unforgeability */ if (j > k && sig[j] <= sig[j - 1]) { return 1; @@ -260,38 +251,11 @@ int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( } /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { + for (j = k; j < OMEGA; ++j) { if (sig[j]) { return 1; } } - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - return 0; } diff --git 
a/crypto_sign/dilithium3/clean/packing.h b/crypto_sign/dilithium3/clean/packing.h index 2900e869..734c6f10 100644 --- a/crypto_sign/dilithium3/clean/packing.h +++ b/crypto_sign/dilithium3/clean/packing.h @@ -1,42 +1,31 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_PACKING_H #define PQCLEAN_DILITHIUM3_CLEAN_PACKING_H - -#include "api.h" #include "params.h" #include "polyvec.h" +#include -void PQCLEAN_DILITHIUM3_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1); -void PQCLEAN_DILITHIUM3_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM3_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); +void PQCLEAN_DILITHIUM3_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); -void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]); +void PQCLEAN_DILITHIUM3_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM3_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, 
const polyveck *h); + +void PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES]); #endif diff --git a/crypto_sign/dilithium3/clean/params.h b/crypto_sign/dilithium3/clean/params.h index a6b3739d..76bca543 100644 --- a/crypto_sign/dilithium3/clean/params.h +++ b/crypto_sign/dilithium3/clean/params.h @@ -2,28 +2,40 @@ #define PQCLEAN_DILITHIUM3_CLEAN_PARAMS_H + #define SEEDBYTES 32 #define CRHBYTES 48 #define N 256 #define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) +#define D 13 +#define ROOT_OF_UNITY 1753 -#define K 5 -#define L 4 -#define ETA 5 -#define SETABITS 4 -#define BETA 275 -#define OMEGA 96 +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 55 +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_ALGNAME "Dilithium3" -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 128 + +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + 
K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) #endif diff --git a/crypto_sign/dilithium3/clean/poly.c b/crypto_sign/dilithium3/clean/poly.c index 84273cab..7a956575 100644 --- a/crypto_sign/dilithium3/clean/poly.c +++ b/crypto_sign/dilithium3/clean/poly.c @@ -4,48 +4,66 @@ #include "reduce.h" #include "rounding.h" #include "symmetric.h" +#include +#define DBENCH_START() +#define DBENCH_STOP(t) /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_reduce * -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_reduce32(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_csubq +* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_caddq * -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_csubq(a->coeffs[i]); +void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_caddq(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_freeze * -* Description: Reduce all coefficients of the polynomial to standard -* representatives. +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_freeze(a->coeffs[i]); } + + DBENCH_STOP(*tred); } /************************************************* @@ -57,85 +75,111 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a) { * - const poly *a: pointer to first summand * - const poly *b: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_sub * -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is +* Description: Subtract polynomials. 
No modular reduction is * performed. * * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial to be -* subtracted from first input polynomial +* subtracted from first input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } + + DBENCH_STOP(*tadd); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl * * Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. +* input coefficients to be less than 2^{31-D} in absolute value. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a) { - for (size_t i = 0; i < N; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { a->coeffs[i] <<= D; } + + DBENCH_STOP(*tmul); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_ntt * * Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value.
* * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + PQCLEAN_DILITHIUM3_CLEAN_ntt(a->coeffs); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont * -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. * * Arguments: - poly *a: pointer to input/output polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(poly *a) { - PQCLEAN_DILITHIUM3_CLEAN_invntt_frominvmont(a->coeffs); +void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3_CLEAN_invntt_tomont(a->coeffs); + + DBENCH_STOP(*tmul); } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery * * Description: Pointwise multiplication of polynomials in NTT domain * representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. +* by 2^{-32}. 
* * Arguments: - poly *c: pointer to output polynomial * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); +void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); } + DBENCH_STOP(*tmul); } /************************************************* @@ -147,13 +191,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly * * standard representatives. * * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* @@ -166,13 +215,18 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a * Assumes coefficients to be standard representatives. 
* * Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* @@ -188,12 +242,16 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { +unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { h->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); s += h->coeffs[i]; } + + DBENCH_STOP(*tround); return s; } @@ -202,42 +260,56 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const * * Description: Use hint polynomial to correct the high bits of a polynomial. 
* -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial * - const poly *h: pointer to input hint polynomial **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); +void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM3_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); } + + DBENCH_STOP(*tround); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm * * Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM3_CLEAN_reduce32(). * * Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm is strictly smaller than B and 1 otherwise. +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { +int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + /* It is ok to leak which coefficient violates the bound since the probability for each coefficient is independent of secret data but we must not leak the sign of the centralized representative. 
*/ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); - t ^= (t >> 31); - t = (Q - 1) / 2 - t; + for (i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); - if ((uint32_t)t >= B) { + if (t >= B) { + DBENCH_STOP(*tsample); return 1; } } + + DBENCH_STOP(*tsample); return 0; } @@ -245,23 +317,23 @@ int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, uint32_t B) { * Name: rej_uniform * * Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. * -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_uniform( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos + 3 <= buflen) { @@ -275,6 +347,7 @@ static size_t rej_uniform( } } + DBENCH_STOP(*tsample); return ctr; } @@ -282,22 +355,20 @@ static size_t rej_uniform( * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_uniform * * Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). 
+* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). * * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; stream128_state state; stream128_init(&state, seed, nonce); @@ -307,52 +378,53 @@ void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, while (ctr < N) { off = buflen % 3; - for (size_t i = 0; i < off; ++i) { + for (i = 0; i < off; ++i) { buf[i] = buf[buflen - off + i]; } - buflen = STREAM128_BLOCKBYTES + off; stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); } - stream128_ctx_release(&state); + stream128_release(&state); } /************************************************* * Name: rej_eta * * Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. +* performing rejection sampling on array of random bytes. 
* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled * - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes +* - unsigned int buflen: length of array of random bytes * * Returns number of sampled coefficients. Can be smaller than len if not enough * random bytes were given. **************************************************/ -static size_t rej_eta( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; uint32_t t0, t1; + DBENCH_START(); ctr = pos = 0; while (ctr < len && pos < buflen) { t0 = buf[pos] & 0x0F; t1 = buf[pos++] >> 4; - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; + if (t0 < 9) { + a[ctr++] = 4 - t0; } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; } } + DBENCH_STOP(*tsample); return ctr; } @@ -360,345 +432,387 @@ static size_t rej_eta( * Name: PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta * * Description: Sample polynomial with uniformly random coefficients -* in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES * - uint16_t nonce: 2-byte nonce **************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) +#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, - const uint8_t *seed, + const uint8_t seed[SEEDBYTES], uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; stream128_state state; stream128_init(&state, seed, nonce); stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - ctr = rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); + ctr = rej_eta(a->coeffs, N, buf, buflen); while (ctr < N) { stream128_squeezeblocks(buf, 1, &state); ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); } - stream128_ctx_release(&state); + stream128_release(&state); } /************************************************* -* Name: rej_gamma1m1 -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. 
-**************************************************/ -static size_t rej_gamma1m1( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1 +* Name: poly_uniform_gamma1m1 * * Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
* * Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES +* - const uint8_t seed[]: byte array with seed of length CRHBYTES * - uint16_t nonce: 16-bit nonce **************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(poly *a, +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; stream256_state state; stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + stream256_release(&state); + PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(a, buf); +} - ctr = rej_gamma1m1(a->coeffs, N, buf, buflen); +/************************************************* +* Name: PQCLEAN_DILITHIUM3_CLEAN_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t mu[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, sizeof buf, &state); - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1(a->coeffs + ctr, N - ctr, buf, buflen); + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)buf[i] << 8 * i; } - stream256_ctx_release(&state); + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf, sizeof buf, &state); + pos = 0; + } + + b = buf[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack * * Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. -* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes +* POLYETA_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; uint8_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - t[0] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 0]); - t[1] = (uint8_t)(Q + ETA - a->coeffs[2 * i + 1]); - r[i] = (uint8_t)(t[0] | (t[1] << 4)); + for (i = 0; i < N / 2; ++i) { + t[0] = (uint8_t) (ETA - a->coeffs[2 * i + 0]); + t[1] = (uint8_t) (ETA - a->coeffs[2 * i + 1]); + r[i] = t[0] | (t[1] << 4); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack * * Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. * * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 2; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[i] & 0x0F; r->coeffs[2 * i + 1] = a[i] >> 4; - r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1]; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; } + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack * -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. * Input coefficients are assumed to be standard representatives. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes +* POLYT1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); } + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack * -* Description: Unpack polynomial t1 with 9-bit coefficients. +* Description: Unpack polynomial t1 with 10-bit coefficients. * Output coefficients are standard representatives. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack * * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes +* POLYT0_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { - uint32_t t[4]; + unsigned int i; + uint32_t t[8]; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; - r[7 * i + 0] = (uint8_t)(t[0]); - r[7 * i + 1] = (uint8_t)(t[0] >> 8); - r[7 * i + 1] |= (uint8_t)(t[1] << 6); - r[7 * i + 2] = (uint8_t)(t[1] >> 2); - r[7 * i + 3] = (uint8_t)(t[1] >> 10); - r[7 * i + 3] |= (uint8_t)(t[2] << 4); - r[7 * i + 4] = (uint8_t)(t[2] >> 4); - r[7 * i + 5] = (uint8_t)(t[2] >> 12); - r[7 * i + 5] |= (uint8_t)(t[3] << 2); - r[7 * i + 6] = (uint8_t)(t[3] >> 6); + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + 
r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack * * Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. * * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 
6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyz_pack * -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes +* POLYZ_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyz_pack(uint8_t *r, const poly *a) { - uint32_t t[2]; + unsigned int i; + uint32_t t[4]; + DBENCH_START(); - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; - r[5 * i + 0] = (uint8_t)t[0]; - r[5 * i + 1] = (uint8_t)(t[0] >> 8); - r[5 * i + 2] = (uint8_t)(t[0] >> 16); - r[5 * i + 2] |= (uint8_t)(t[1] << 4); - r[5 * i + 3] = (uint8_t)(t[1] >> 4); - r[5 * i + 4] = (uint8_t)(t[1] >> 12); + r[5 * i + 0] = (uint8_t) t[0]; + r[5 * i + 1] = (uint8_t) (t[0] >> 8); + r[5 * i + 2] = (uint8_t) (t[0] >> 16); + r[5 * i + 2] |= (uint8_t) (t[1] << 4); + r[5 * i + 3] = (uint8_t) (t[1] >> 4); + r[5 * i + 4] = (uint8_t) (t[1] >> 12); } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack * * Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. +* in [-(GAMMA1 - 1), GAMMA1]. 
* * Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: byte array with bit-packed polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 2; ++i) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16; + r->coeffs[2 * i + 0] &= 0xFFFFF; r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 0] &= 0xFFFFF; - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; + r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; } + + DBENCH_STOP(*tpack); } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack * -* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. * Input coefficients are assumed to be standard representatives. 
* * Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes +* POLYW1_PACKEDBYTES bytes * - const poly *a: pointer to input polynomial **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); } + DBENCH_STOP(*tpack); } diff --git a/crypto_sign/dilithium3/clean/poly.h b/crypto_sign/dilithium3/clean/poly.h index 40b48c67..d5531e26 100644 --- a/crypto_sign/dilithium3/clean/poly.h +++ b/crypto_sign/dilithium3/clean/poly.h @@ -1,53 +1,40 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_POLY_H #define PQCLEAN_DILITHIUM3_CLEAN_POLY_H - -#include +#include "params.h" #include -#include "params.h" - typedef struct { - uint32_t coeffs[N]; + int32_t coeffs[N]; } poly; void PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(poly *a); void PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_poly_add( - poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM3_CLEAN_poly_sub( - poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_CLEAN_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); void PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(poly *a); void PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery( - poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); -void 
PQCLEAN_DILITHIUM3_CLEAN_poly_power2round( - poly *a1, poly *a0, const poly *a); -void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose( - poly *a1, poly *a0, const poly *a); -uint32_t PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint( - poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint( - poly *a, const poly *b, const poly *h); +void PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); -int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm( - const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform( - poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta( - poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce); +int PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); void PQCLEAN_DILITHIUM3_CLEAN_polyeta_pack(uint8_t *r, const poly *a); void PQCLEAN_DILITHIUM3_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); diff --git a/crypto_sign/dilithium3/clean/polyvec.c b/crypto_sign/dilithium3/clean/polyvec.c index 8e7f08dc..67212779 100644 --- a/crypto_sign/dilithium3/clean/polyvec.c +++ b/crypto_sign/dilithium3/clean/polyvec.c @@ -1,14 +1,65 @@ -#include -#include - #include "params.h" #include 
"poly.h" #include "polyvec.h" +#include + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). +* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for (i = 0; i < K; ++i) { + for (j = 0; j < L; ++j) { + PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); + } + } +} + +void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} /**************************************************************/ /************ Vectors of polynomials of length L **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); + } +} + +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&v->vec[i]); + } +} + /************************************************* * Name: 
PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze * @@ -18,7 +69,9 @@ * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(&v->vec[i]); } } @@ -33,9 +86,10 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v) { * - const polyvecl *u: pointer to first summand * - const polyvecl *v: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add( - polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -49,32 +103,49 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add( * Arguments: - polyvecl *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { + unsigned int i; + + for (i = 0; i < L; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&v->vec[i]); } } +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery * * Description: Pointwise multiply vectors of 
polynomials of length L, multiply * resulting vector by 2^{-32} and add (accumulate) polynomials * in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. Output -* coeffcient are less than 2*L*Q. * * Arguments: - poly *w: output polynomial * - const polyvecl *u: pointer to first input vector * - const polyvecl *v: pointer to second input vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v) { +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + unsigned int i; poly t; - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]); - - for (size_t i = 1; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(&t, &u->vec[i], &v->vec[i]); + PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); + for (i = 1; i < L; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); PQCLEAN_DILITHIUM3_CLEAN_poly_add(w, w, &t); } } @@ -83,17 +154,19 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery( * Name: PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm * * Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(). * * Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { - for (size_t i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) { +int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { return 1; } } @@ -105,32 +178,43 @@ int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { /************ Vectors of polynomials of length K **************/ /**************************************************************/ +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce * * Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. +* to representatives in [-6283009,6283007]. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq * * Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. +* add Q if coefficient is negative. 
* * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_csubq(&v->vec[i]); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_caddq(&v->vec[i]); } } @@ -143,7 +227,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_freeze(&v->vec[i]); } } @@ -158,9 +244,10 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v) { * - const polyveck *u: pointer to first summand * - const polyveck *v: pointer to second summand **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -169,17 +256,17 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add( * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub * * Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. +* No modular reduction is performed. 
* * Arguments: - polyveck *w: pointer to output vector * - const polyveck *u: pointer to first input vector * - const polyveck *v: pointer to second input vector to be * subtracted from first input vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); } } @@ -188,12 +275,14 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub( * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl * * Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. +* reduction. Assumes input coefficients to be less than 2^{31-D}. * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_shiftl(&v->vec[i]); } } @@ -207,13 +296,15 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v) { * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&v->vec[i]); } } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery +* Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont * * Description: Inverse NTT and multiplication by 2^{32} of polynomials * in vector of length K. 
Input coefficients need to be less @@ -221,27 +312,40 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v) { * * Arguments: - polyveck *v: pointer to input/output vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&v->vec[i]); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_tomont(&v->vec[i]); } } +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm * * Description: Check infinity norm of polynomials in vector of length K. -* Assumes input coefficients to be standard representatives. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(). * * Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound +* - int32_t B: norm bound * -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
**************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { - for (size_t i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], B)) { +int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM3_CLEAN_poly_chknorm(&v->vec[i], bound)) { return 1; } } @@ -253,19 +357,20 @@ int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round * * Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 * with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be * standard representatives. * * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -274,7 +379,7 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round( * Name: PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose * * Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 +* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 * with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we * set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. 
* Assumes coefficients to be standard representatives. @@ -282,12 +387,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round( * Arguments: - polyveck *v1: pointer to output vector of polynomials with * coefficients a1 * - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 +* coefficients a0 * - const polyveck *v: pointer to input vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { PQCLEAN_DILITHIUM3_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); } } @@ -303,15 +409,13 @@ void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose( * * Returns number of 1 bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; +unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint( - &h->vec[i], &v0->vec[i], &v1->vec[i]); + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM3_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); } return s; @@ -324,13 +428,21 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint( * * Arguments: - polyveck *w: pointer to output vector of polynomials with * corrected high bits -* - const polyveck *v: pointer to input vector +* - const polyveck *u: pointer to input vector * - const polyveck *h: pointer to input hint vector **************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - 
PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint( - &w->vec[i], &v->vec[i], &h->vec[i]); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); } } diff --git a/crypto_sign/dilithium3/clean/polyvec.h b/crypto_sign/dilithium3/clean/polyvec.h index abc57d38..be184a7b 100644 --- a/crypto_sign/dilithium3/clean/polyvec.h +++ b/crypto_sign/dilithium3/clean/polyvec.h @@ -1,25 +1,33 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H #define PQCLEAN_DILITHIUM3_CLEAN_POLYVEC_H - -#include - #include "params.h" #include "poly.h" +#include /* Vectors of polynomials of length L */ typedef struct { poly vec[L]; } polyvecl; +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(polyvecl *v); + void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(polyvecl *v); void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(polyvecl *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); -int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const 
polyvecl *v, uint32_t B); + +int PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); @@ -28,31 +36,33 @@ typedef struct { poly vec[K]; } polyveck; +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + void PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(polyveck *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(polyveck *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(polyveck *v); void PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(polyveck *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); void PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(polyveck *v); void PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(polyveck *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); -int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm( - const polyveck *v, uint32_t B); +int PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void 
PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); #endif diff --git a/crypto_sign/dilithium3/clean/reduce.c b/crypto_sign/dilithium3/clean/reduce.c index 02da8968..ded18523 100644 --- a/crypto_sign/dilithium3/clean/reduce.c +++ b/crypto_sign/dilithium3/clean/reduce.c @@ -1,60 +1,54 @@ -#include - #include "params.h" #include "reduce.h" +#include /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce * -* Description: For finite field element a with 0 <= a <= Q*2^32, -* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. * -* Arguments: - uint64_t: finite field element a +* Arguments: - int64_t: finite field element a * * Returns r. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(uint64_t a) { - uint64_t t; +int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; - t = a * QINV; - t &= (1ULL << 32) - 1; - t *= Q; - t = a + t; - t >>= 32; - return (uint32_t)t; + t = (int32_t)((uint64_t)a * (uint64_t)QINV); + t = (a - (int64_t)t * Q) >> 32; + return t; } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_reduce32 * -* Description: For finite field element a, compute r \equiv a (mod Q) -* such that 0 <= r < 2*Q. +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. * -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(uint32_t a) { - uint32_t t; +int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a) { + int32_t t; - t = a & 0x7FFFFF; - a >>= 23; - t += (a << 13) - a; + t = (a + (1 << 22)) >> 23; + t = a - t * Q; return t; } /************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_csubq +* Name: PQCLEAN_DILITHIUM3_CLEAN_caddq * -* Description: Subtract Q if input coefficient is bigger than Q. +* Description: Add Q if input coefficient is negative. * -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_csubq(uint32_t a) { - a -= Q; - a += ((int32_t)a >> 31) & Q; +int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; return a; } @@ -62,14 +56,14 @@ uint32_t PQCLEAN_DILITHIUM3_CLEAN_csubq(uint32_t a) { * Name: PQCLEAN_DILITHIUM3_CLEAN_freeze * * Description: For finite field element a, compute standard -* representative r = a mod Q. +* representative r = a mod^+ Q. 
* -* Arguments: - uint32_t: finite field element a +* Arguments: - int32_t: finite field element a * * Returns r. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(uint32_t a) { +int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a) { a = PQCLEAN_DILITHIUM3_CLEAN_reduce32(a); - a = PQCLEAN_DILITHIUM3_CLEAN_csubq(a); + a = PQCLEAN_DILITHIUM3_CLEAN_caddq(a); return a; } diff --git a/crypto_sign/dilithium3/clean/reduce.h b/crypto_sign/dilithium3/clean/reduce.h index ba94792e..e3e3ed36 100644 --- a/crypto_sign/dilithium3/clean/reduce.h +++ b/crypto_sign/dilithium3/clean/reduce.h @@ -1,21 +1,17 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H #define PQCLEAN_DILITHIUM3_CLEAN_REDUCE_H - +#include "params.h" #include -#define MONT 4193792U // 2^32 % Q -#define QINV 4236238847U // -q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 -/* a <= Q*2^32 => r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(uint64_t a); +int32_t PQCLEAN_DILITHIUM3_CLEAN_montgomery_reduce(int64_t a); -/* r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(uint32_t a); +int32_t PQCLEAN_DILITHIUM3_CLEAN_reduce32(int32_t a); -/* a < 2*Q => r < Q */ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_csubq(uint32_t a); +int32_t PQCLEAN_DILITHIUM3_CLEAN_caddq(int32_t a); -/* r < Q */ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(uint32_t a); +int32_t PQCLEAN_DILITHIUM3_CLEAN_freeze(int32_t a); #endif diff --git a/crypto_sign/dilithium3/clean/rounding.c b/crypto_sign/dilithium3/clean/rounding.c index 2da5f5d1..f0181477 100644 --- a/crypto_sign/dilithium3/clean/rounding.c +++ b/crypto_sign/dilithium3/clean/rounding.c @@ -1,86 +1,70 @@ #include "params.h" #include "rounding.h" +#include /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_power2round * * Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. 
+* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. * Assumes a to be standard representative. * -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 * * Returns a1. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(uint32_t a, uint32_t *a0) { - uint32_t t; +int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0, int32_t a) { + int32_t a1; - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; + a1 = (a + (1 << (D - 1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_decompose * * Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except * if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard * representative. * -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 * * Returns a1. 
**************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; +int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0, int32_t a) { + int32_t a1; - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFFu; - t += (int32_t)((a >> 19u) << 9u); - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= (uint32_t)t; + a1 = (a + 127) >> 7; + a1 = (a1 * 1025 + (1 << 21)) >> 22; + a1 &= 15; - /* Divide by ALPHA (possible to avoid) */ - u = (int32_t)(a - 1); - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - - /* Border case */ - *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); - a &= 0xFu; - return a; + *a0 = a - a1 * 2 * GAMMA2; + *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; + return a1; } /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_make_hint * * Description: Compute hint bit indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. +* input element overflow into the high bits. * -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element * -* Returns 1 if high bits of a and b differ and 0 otherwise. +* Returns 1 if overflow. **************************************************/ -unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; +unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0, int32_t a1) { + if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { + return 1; } - return 1; + return 0; } /************************************************* @@ -88,30 +72,21 @@ unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1) { * * Description: Correct high bits according to hint. 
* -* Arguments: - uint32_t a: input element +* Arguments: - int32_t a: input element * - unsigned int hint: hint bit * * Returns corrected high bits. **************************************************/ -uint32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(uint32_t a, unsigned int hint) { - uint32_t a0, a1; +int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; - a1 = PQCLEAN_DILITHIUM3_CLEAN_decompose(a, &a0); + a1 = PQCLEAN_DILITHIUM3_CLEAN_decompose(&a0, a); if (hint == 0) { return a1; } - if (a0 > Q) { - return (a1 + 1) & 0xF; + + if (a0 > 0) { + return (a1 + 1) & 15; } - - return (a1 - 1) & 0xF; - - /* If PQCLEAN_DILITHIUM3_CLEAN_decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ + return (a1 - 1) & 15; } diff --git a/crypto_sign/dilithium3/clean/rounding.h b/crypto_sign/dilithium3/clean/rounding.h index acb2fbdd..0cc52cc0 100644 --- a/crypto_sign/dilithium3/clean/rounding.h +++ b/crypto_sign/dilithium3/clean/rounding.h @@ -1,11 +1,14 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H #define PQCLEAN_DILITHIUM3_CLEAN_ROUNDING_H - +#include "params.h" #include -uint32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(uint32_t a, uint32_t *a0); -uint32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(uint32_t a, unsigned int hint); +int32_t PQCLEAN_DILITHIUM3_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM3_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM3_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM3_CLEAN_use_hint(int32_t a, unsigned int hint); #endif diff --git a/crypto_sign/dilithium3/clean/sign.c b/crypto_sign/dilithium3/clean/sign.c index 1715c42f..a585a509 100644 --- a/crypto_sign/dilithium3/clean/sign.c +++ 
b/crypto_sign/dilithium3/clean/sign.c @@ -1,6 +1,3 @@ -#include -#include - #include "fips202.h" #include "packing.h" #include "params.h" @@ -9,84 +6,7 @@ #include "randombytes.h" #include "sign.h" #include "symmetric.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). -* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < L; ++j) { - PQCLEAN_DILITHIUM3_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t)((i << 8) + j)); - } - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM3_CLEAN_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - uint8_t b; - size_t pos; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -((int32_t)signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - shake256_ctx_release(&state); -} +#include /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair @@ -94,9 +14,9 @@ void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, * Description: Generates public and private key. 
* * Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) * - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES bytes) +* array of PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ @@ -104,48 +24,42 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { uint8_t seedbuf[3 * SEEDBYTES]; uint8_t tr[CRHBYTES]; const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; polyvecl mat[K]; polyvecl s1, s1hat; - polyveck s2, t, t1, t0; + polyveck s2, t1, t0; - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); rho = seedbuf; rhoprime = seedbuf + SEEDBYTES; key = seedbuf + 2 * SEEDBYTES; /* Expand matrix */ - PQCLEAN_DILITHIUM3_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); /* Sample short vectors s1 and s2 */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&s1.vec[i], rhoprime, nonce++); - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_eta(&s2.vec[i], rhoprime, nonce++); - } + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); /* Matrix-vector multiplication */ s1hat = s1; PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&t.vec[i]); - } + 
PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&t1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&t1); /* Add error vector s2 */ - PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&t, &t, &s2); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&t1, &t1, &s2); /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(&t); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(&t1, &t0, &t); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&t1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_power2round(&t1, &t0, &t1); PQCLEAN_DILITHIUM3_CLEAN_pack_pk(pk, rho, &t1); /* Compute CRH(rho, t1) and write secret key */ crh(tr, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM3_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); + PQCLEAN_DILITHIUM3_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); return 0; } @@ -153,44 +67,41 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature * -* Description: Compute signed message. +* Description: Computes signature. 
* -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES -* of len) -* - size_t *smlen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *msg, size_t mlen, - const uint8_t *sk) { +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + unsigned int n; uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint32_t n; uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; + polyvecl mat[K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + shake256incctx state; rho = seedbuf; tr = rho + SEEDBYTES; key = tr + CRHBYTES; mu = key + SEEDBYTES; rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); + PQCLEAN_DILITHIUM3_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); - // use incremental hash API instead of copying around buffers /* Compute CRH(tr, msg) */ - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, msg, mlen); + shake256_inc_absorb(&state, m, mlen); shake256_inc_finalize(&state); shake256_inc_squeeze(mu, CRHBYTES, &state); shake256_inc_ctx_release(&state); @@ -198,76 
+109,71 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature( crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM3_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&s1); PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&s2); PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t0); rej: /* Sample intermediate vector y */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_uniform_gamma1m1(&y.vec[i], rhoprime, nonce++); - } + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); /* Matrix-vector multiplication */ - yhat = y; - PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM3_CLEAN_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&w.vec[i]); - } + z = y; + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1); /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(&w); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM3_CLEAN_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&chat); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_decompose(&w1, &w0, &w1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(sig, &w1); - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w0, &w0, &cs2); - 
PQCLEAN_DILITHIUM3_CLEAN_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, sig); + PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp); /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&z.vec[i]); - } + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_invntt_tomont(&z); PQCLEAN_DILITHIUM3_CLEAN_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM3_CLEAN_polyvecl_freeze(&z); + PQCLEAN_DILITHIUM3_CLEAN_polyvecl_reduce(&z); if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { goto rej; } - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM3_CLEAN_poly_invntt_montgomery(&ct0.vec[i]); - } - - PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&ct0, GAMMA2)) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w0, &w0, &h); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w0); + if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { goto rej; } - PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(&w0); + /* Compute hints for w1 */ + 
PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&h); + if (PQCLEAN_DILITHIUM3_CLEAN_polyveck_chknorm(&h, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM3_CLEAN_polyveck_add(&w0, &w0, &h); n = PQCLEAN_DILITHIUM3_CLEAN_polyveck_make_hint(&h, &w0, &w1); if (n > OMEGA) { goto rej; } /* Write signature */ - PQCLEAN_DILITHIUM3_CLEAN_pack_sig(sig, &z, &h, &c); + PQCLEAN_DILITHIUM3_CLEAN_pack_sig(sig, sig, &z, &h); *siglen = PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; return 0; } @@ -281,53 +187,63 @@ rej: * array with PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen bytes), * can be equal to m * - size_t *smlen: pointer to output length of signed -* message +* message * - const uint8_t *m: pointer to message to be signed * - size_t mlen: length of message * - const uint8_t *sk: pointer to bit-packed secret key * * Returns 0 (success) **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, m, mlen); - rc = PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; -} +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + size_t i; + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} /************************************************* * Name: PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify * -* Description: Verify signed message. +* Description: Verifies signature. 
* -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key +* Arguments: - uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key * -* Returns 0 if signed message could be verified correctly and -1 otherwise +* Returns 0 if signature could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk) { +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; uint8_t rho[SEEDBYTES]; uint8_t mu[CRHBYTES]; - poly c, chat, cp; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; + polyveck t1, w1, h; + shake256incctx state; - if (siglen < PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { + if (siglen != PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { return -1; } PQCLEAN_DILITHIUM3_CLEAN_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(&z, &h, &c, sig)) { + if (PQCLEAN_DILITHIUM3_CLEAN_unpack_sig(c, &z, &h, sig)) { return -1; } if (PQCLEAN_DILITHIUM3_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { @@ -336,8 +252,6 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( /* Compute CRH(CRH(rho, t1), msg) */ crh(mu, pk, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; shake256_inc_init(&state); shake256_inc_absorb(&state, mu, CRHBYTES); shake256_inc_absorb(&state, m, mlen); @@ -346,38 +260,39 @@ int
PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( shake256_inc_ctx_release(&state); /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM3_CLEAN_expand_mat(mat, rho); + PQCLEAN_DILITHIUM3_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_expand(mat, rho); PQCLEAN_DILITHIUM3_CLEAN_polyvecl_ntt(&z); - for (size_t i = 0; i < K ; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); - } + PQCLEAN_DILITHIUM3_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); - chat = c; - PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&chat); + PQCLEAN_DILITHIUM3_CLEAN_poly_ntt(&cp); PQCLEAN_DILITHIUM3_CLEAN_polyveck_shiftl(&t1); PQCLEAN_DILITHIUM3_CLEAN_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM3_CLEAN_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); - } + PQCLEAN_DILITHIUM3_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&tmp1, &tmp1, &tmp2); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&tmp1); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_montgomery(&tmp1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_sub(&w1, &w1, &t1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_invntt_tomont(&w1); /* Reconstruct w1 */ - PQCLEAN_DILITHIUM3_CLEAN_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(&w1, &tmp1, &h); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM3_CLEAN_polyveck_pack_w1(buf, &w1); - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM3_CLEAN_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { + /* Call random oracle and verify challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); +
shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { return -1; } } - // All good return 0; } @@ -387,7 +302,7 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( * Description: Verify signed message. * * Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm +* array with smlen bytes), can be equal to sm * - size_t *mlen: pointer to output length of message * - const uint8_t *sm: pointer to signed message * - size_t smlen: length of signed message @@ -395,33 +310,34 @@ int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify( * * Returns 0 if signed message could be verified correctly and -1 otherwise **************************************************/ -int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + if (smlen < PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES) { goto badsig; } - *mlen = smlen - PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; - if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + *mlen = smlen - PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES, *mlen, pk)) { goto badsig; } else { /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { + for (i = 0; i < *mlen; ++i) { m[i] = sm[PQCLEAN_DILITHIUM3_CLEAN_CRYPTO_BYTES + i]; } return 0; } - /* Signature verification failed */ badsig: + /* Signature verification failed */ *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { + for (i = 0; i < smlen; ++i) { m[i] = 0; } return -1; } - diff --git 
a/crypto_sign/dilithium3/clean/sign.h b/crypto_sign/dilithium3/clean/sign.h index ae80256a..20336537 100644 --- a/crypto_sign/dilithium3/clean/sign.h +++ b/crypto_sign/dilithium3/clean/sign.h @@ -1,12 +1,29 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_SIGN_H #define PQCLEAN_DILITHIUM3_CLEAN_SIGN_H - -#include "api.h" #include "params.h" #include "poly.h" #include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM3_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); -void PQCLEAN_DILITHIUM3_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM3_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); #endif diff --git a/crypto_sign/dilithium3/clean/stream.c b/crypto_sign/dilithium3/clean/stream.c deleted file mode 100644 index a1ac2ff0..00000000 --- a/crypto_sign/dilithium3/clean/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t 
buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium3/clean/stream.h b/crypto_sign/dilithium3/clean/stream.h deleted file mode 100644 index 711b266f..00000000 --- a/crypto_sign/dilithium3/clean/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM3_CLEAN_STREAM_H -#define PQCLEAN_DILITHIUM3_CLEAN_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium3/clean/symmetric-shake.c b/crypto_sign/dilithium3/clean/symmetric-shake.c new file mode 100644 index 00000000..a09dbd64 --- /dev/null +++ b/crypto_sign/dilithium3/clean/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium3/clean/symmetric.h b/crypto_sign/dilithium3/clean/symmetric.h index 736f1744..dd88beef 100644 --- 
a/crypto_sign/dilithium3/clean/symmetric.h +++ b/crypto_sign/dilithium3/clean/symmetric.h @@ -1,25 +1,36 @@ #ifndef PQCLEAN_DILITHIUM3_CLEAN_SYMMETRIC_H #define PQCLEAN_DILITHIUM3_CLEAN_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - #include "fips202.h" +#include "params.h" +#include -#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_CLEAN_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3_CLEAN_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); #define STREAM128_BLOCKBYTES SHAKE128_RATE #define STREAM256_BLOCKBYTES SHAKE256_RATE -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, 
STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) #endif diff --git a/crypto_sign/dilithium3aes/META.yml b/crypto_sign/dilithium3aes/META.yml new file mode 100644 index 00000000..7b42c902 --- /dev/null +++ b/crypto_sign/dilithium3aes/META.yml @@ -0,0 +1,31 @@ +name: Dilithium3-AES +type: signature +claimed-nist-level: 3 +length-public-key: 1952 +length-secret-key: 4016 +length-signature: 3293 +nistkat-sha256: c1519093239804f90d1c9386e2a95b42b45dc65cbdc7c1dd777fe27de3840517 +testvectors-sha256: 9637ff196abfad19f3479e6a6ec3e91fc6de3bae89adf8617d91154063a3262a +principal-submitters: + - Vadim Lyubashevsky +auxiliary-submitters: + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Peter Schwabe + - Gregor Seiler + - Damien Stehlé +implementations: + - name: clean + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium3aes/avx2/LICENSE b/crypto_sign/dilithium3aes/avx2/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium3aes/avx2/Makefile b/crypto_sign/dilithium3aes/avx2/Makefile new file mode 100644 index 00000000..2a174dd4 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/Makefile @@ -0,0 +1,23 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium3aes_avx2.a +HEADERS=aes256ctr.h align.h api.h cdecl.h consts.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=aes256ctr.o consts.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o invntt.o ntt.o pointwise.o shuffle.o +CFLAGS=-mavx2 -maes -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium3aes/avx2/aes256ctr.c b/crypto_sign/dilithium3aes/avx2/aes256ctr.c new file mode 100644 index 00000000..e6e165b8 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/aes256ctr.c @@ -0,0 +1,142 @@ +#include "aes256ctr.h" +#include +#include +#include +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + * separated 64-bit nonce and internal 64-bit counter, starting from zero + * Public Domain */ + + +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + + /* Load current counter value */ + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 = 
_mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); + + for (int i = 1; i < 14; i++) { + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); + } + + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); + + /* Write results */ + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); +} + +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; + + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n = _mm_loadl_epi64((__m128i *)&nonce); + + state->rkeys[idx++] = key0; + temp0 = key0; + temp2 = key1; + temp4 = _mm_setzero_si128(); + +#define BLOCK1(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \ + state->rkeys[idx++] = temp2; \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x10); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x8c); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xff); \ + temp0 = _mm_xor_si128(temp0, temp1) + +#define BLOCK2(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ + state->rkeys[idx++] = temp0; \ + temp4 = 
(__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x10); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x8c); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xaa); \ + temp2 = _mm_xor_si128(temp2, temp1) + + BLOCK1(0x01); + BLOCK2(0x01); + + BLOCK1(0x02); + BLOCK2(0x02); + + BLOCK1(0x04); + BLOCK2(0x04); + + BLOCK1(0x08); + BLOCK2(0x08); + + BLOCK1(0x10); + BLOCK2(0x10); + + BLOCK1(0x20); + BLOCK2(0x20); + + BLOCK1(0x40); + state->rkeys[idx++] = temp0; +} + +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state) { + size_t i; + for (i = 0; i < nblocks; i++) { + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i; + uint8_t buf[64]; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&state, seed, nonce); + + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; + out += 64; + } + + if (outlen) { + aesni_encrypt4(buf, &state.n, state.rkeys); + for (i = 0; i < outlen; i++) { + out[i] = buf[i]; + } + } +} diff --git a/crypto_sign/dilithium3aes/avx2/aes256ctr.h b/crypto_sign/dilithium3aes/avx2/aes256ctr.h new file mode 100644 index 00000000..d0e6fda8 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/aes256ctr.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_AES256CTR_H +#define PQCLEAN_DILITHIUM3AES_AVX2_AES256CTR_H + +#include +#include +#include + + +#define AES256CTR_BLOCKBYTES 64 + +typedef struct { + __m128i rkeys[16]; + __m128i n; +} aes256ctr_ctx; + +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint64_t nonce); + +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state); 
+ +void PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/align.h b/crypto_sign/dilithium3aes/avx2/align.h new file mode 100644 index 00000000..c041f48c --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM3AES_AVX2_ALIGN_H + +#include +#include + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/api.h b/crypto_sign/dilithium3aes/avx2/api.h new file mode 100644 index 00000000..39a2fbad --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_API_H +#define PQCLEAN_DILITHIUM3AES_AVX2_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES 1952 +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES 4016 +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES 3293 + +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_ALGNAME "Dilithium3-AES" + + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/cdecl.h b/crypto_sign/dilithium3aes/avx2/cdecl.h new file mode 100644 index 
00000000..50487c85 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/cdecl.h @@ -0,0 +1,24 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM3AES_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#define _cdecl(s) _##s +#define cdecl(s) s + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/consts.c b/crypto_sign/dilithium3aes/avx2/consts.c new file mode 100644 index 00000000..d5203cf9 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) + +const qdata_t PQCLEAN_DILITHIUM3AES_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400, 
-1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 
1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + -1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, 
-2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium3aes/avx2/consts.h 
b/crypto_sign/dilithium3aes/avx2/consts.h new file mode 100644 index 00000000..3fbd6271 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM3AES_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM3AES_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/invntt.S b/crypto_sign/dilithium3aes/avx2/invntt.S new file mode 100644 index 00000000..9cc43174 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/invntt.S @@ -0,0 +1,240 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 + +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h + +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +.endm + +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +/* level 0 */ +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 + +vpermq 
$0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 + +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 + +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 + +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 + +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 + +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 + +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 + +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 + +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 
128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) + +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd 
$0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx) +cdecl(PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx): +_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 1 +levels6t7 2 +levels6t7 3 + +ret diff --git a/crypto_sign/dilithium3aes/avx2/ntt.S b/crypto_sign/dilithium3aes/avx2/ntt.S new file mode 100644 index 00000000..6e3920ed --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/ntt.S @@ -0,0 +1,199 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 + +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l +.endm + +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 + +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +/* level 1 */ +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 
+butterfly 8,10 +butterfly 9,11 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm + +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 + +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 + +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 + +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 + +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 
4,11,1,10,2,15 + +/* level 7 */ +vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx) +cdecl(PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx): +_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 + +ret + diff --git a/crypto_sign/dilithium3aes/avx2/ntt.h b/crypto_sign/dilithium3aes/avx2/ntt.h new file mode 100644 index 00000000..4866c299 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/ntt.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_NTT_H +#define PQCLEAN_DILITHIUM3AES_AVX2_NTT_H + +#include + +void PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3AES_AVX2_qdata); +void PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM3AES_AVX2_qdata); + +void PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx(__m256i *a); + +void PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx(__m256i *c, const __m256i 
*a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3AES_AVX2_qdata); +void PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM3AES_AVX2_qdata); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/packing.c b/crypto_sign/dilithium3aes/avx2/packing.c new file mode 100644 index 00000000..8a849f1f --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). +* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for
(i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t tr[]: output byte array for tr +* - uint8_t key[]: output byte array for key +* - polyveck *t0: pointer to output vector t0 +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h).
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0.
+**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium3aes/avx2/packing.h b/crypto_sign/dilithium3aes/avx2/packing.h new file mode 100644 index 00000000..b8053269 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM3AES_AVX2_PACKING_H +#include "params.h" +#include "polyvec.h" +#include + +void PQCLEAN_DILITHIUM3AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM3AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM3AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void 
PQCLEAN_DILITHIUM3AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM3AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM3AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/params.h b/crypto_sign/dilithium3aes/avx2/params.h new file mode 100644 index 00000000..af88be1e --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM3AES_AVX2_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 55 +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_ALGNAME "Dilithium3-AES" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 128 + +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/pointwise.S b/crypto_sign/dilithium3aes/avx2/pointwise.S new file mode 100644 index 00000000..1ff0179f --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/pointwise.S @@ 
-0,0 +1,201 @@ +#include "params.h" +#include "cdecl.h" + +.text +.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx) +.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx): +_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm14,%ymm15 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq 
%ymm5,%ymm13,%ymm5 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm6,%ymm10,%ymm6 +vpmuldq %ymm7,%ymm11,%ymm7 +vpmuldq %ymm8,%ymm12,%ymm8 +vpmuldq %ymm9,%ymm13,%ymm9 +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 +.endm + +.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx) +.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx): +_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 + +pointwise 1024 +acc + +pointwise 2048 +acc + +pointwise 3072 +acc + +pointwise 4096 +acc + + + +#reduce +vpmuldq %ymm0,%ymm2,%ymm6 +vpmuldq %ymm0,%ymm3,%ymm7 +vpmuldq %ymm0,%ymm4,%ymm8 +vpmuldq %ymm0,%ymm5,%ymm9 +vpmuldq %ymm1,%ymm6,%ymm6 +vpmuldq %ymm1,%ymm7,%ymm7 +vpmuldq %ymm1,%ymm8,%ymm8 +vpmuldq %ymm1,%ymm9,%ymm9 +vpsubq %ymm6,%ymm2,%ymm2 +vpsubq %ymm7,%ymm3,%ymm3 +vpsubq %ymm8,%ymm4,%ymm4 +vpsubq %ymm9,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 
+vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium3aes/avx2/poly.c b/crypto_sign/dilithium3aes/avx2/poly.c new file mode 100644 index 00000000..1bdfde28 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/poly.c @@ -0,0 +1,862 @@ +#include "align.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" +#include +#include +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in +* absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_freeze(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(a); + PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(a); + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i f; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_nttunpack(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* positive standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be positive standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_make_hint +* +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. +* +* Arguments: - uint8_t *h: pointer to output hint array (preallocated of length N) +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of hints, i.e. length of hint array. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM3AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM3AES_AVX2_make_hint_avx(hint, a0->vec, a1->vec); + + DBENCH_STOP(*tround); + return r; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. 
+* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_AVX2_use_hint_avx(b->vec, a->vec, h->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + r = 1 - _mm256_testz_si256(t, t); + DBENCH_STOP(*tsample); + return r; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < Q) { + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM3AES_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM3AES_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + /* length of buf is always divisible by 3; hence, no bytes left */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] 
by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 9) { + a[ctr++] = 4 - t0; + } + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output stream of SHAKE256(seed|nonce) +* or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM3AES_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack reads up to 12 bytes past POLYZ_PACKEDBYTES; the 14-byte pad is a safe over-allocation */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack(a, buf.coeffs); +} + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); + pos = 8; + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = ETA - a->coeffs[2 * i + 0]; + t[1] = ETA - a->coeffs[2 * i + 1]; + r[i] = t[0] | (t[1] << 4); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[i] & 0x0F; + r->coeffs[2 * i + 1] = a[i] >> 4; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = t[0]; + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] 
|= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * i + 12] = t[7] >> 5; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + 
r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 12]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, + -1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); + const __m256i mask = _mm256_set1_epi32(0xFFFFF); + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); + _mm256_store_si256(&r->vec[i], f); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); + const __m256i shufbidx = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + DBENCH_START(); + + for (i = 0; i < N / 64; ++i) { + f0 = _mm256_load_si256(&a->vec[8 * i + 0]); + f1 = _mm256_load_si256(&a->vec[8 * i + 1]); + f2 = _mm256_load_si256(&a->vec[8 * i + 2]); + f3 = _mm256_load_si256(&a->vec[8 * i + 3]); + f4 = _mm256_load_si256(&a->vec[8 * i + 4]); + f5 = _mm256_load_si256(&a->vec[8 * i + 5]); + f6 = _mm256_load_si256(&a->vec[8 * i + 6]); + f7 = _mm256_load_si256(&a->vec[8 * i + 7]); + f0 = _mm256_packus_epi32(f0, f1); + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium3aes/avx2/poly.h b/crypto_sign/dilithium3aes/avx2/poly.h new file mode 100644 index 00000000..6b6cf193 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/poly.h @@ -0,0 +1,52 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_POLY_H +#define PQCLEAN_DILITHIUM3AES_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" +#include + +typedef ALIGNED_INT32(N) poly; + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(poly *a); +void 
PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM3AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM3AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM3AES_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const 
poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 12]); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *a); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/polyvec.c b/crypto_sign/dilithium3aes/avx2/polyvec.c new file mode 100644 index 00000000..1047740e --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/polyvec.c @@ -0,0 +1,449 @@ +#include "aes256ctr.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stdint.h> + +#define UNUSED(x) (void)x + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + uint64_t nonce; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&state, rho, 0); + + for (i = 0; i < K; i++) { + for (j = 0; j < L; j++) { + nonce = (i << 8) + j; + state.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(&mat[i].vec[j], &state); + PQCLEAN_DILITHIUM3AES_AVX2_poly_nttunpack(&mat[i].vec[j]); + } + } +} + + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length 
L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM3AES_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3AES_AVX2_polyveck_reduce(). 
+* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - uint8_t *hint: pointer to output hint array +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM3AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; + + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM3AES_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); + } + + return n; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. 
+* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium3aes/avx2/polyvec.h b/crypto_sign/dilithium3aes/avx2/polyvec.h new file mode 100644 index 00000000..0a76db3d --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/polyvec.h @@ -0,0 +1,64 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM3AES_AVX2_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const 
polyvecl *u, + const polyvecl *v); + +int PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM3AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM3AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/rejsample.c b/crypto_sign/dilithium3aes/avx2/rejsample.c new file mode 
100644 index 00000000..04af5fde --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/rejsample.c @@ -0,0 +1,378 @@ +#include "params.h" +#include "rejsample.h" +#include "symmetric.h" +#include +#include + +const uint8_t PQCLEAN_DILITHIUM3AES_AVX2_idxlut[256][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 1, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 0, 0, 0, 0, 0}, + { 2, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0}, + { 1, 2, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 0, 0, 0, 0, 0}, + { 3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 
0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + { 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 
0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, + { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 
0, 0, 0, 0},
+    { 1, 2, 6, 7, 0, 0, 0, 0},
+    { 0, 1, 2, 6, 7, 0, 0, 0},
+    { 3, 6, 7, 0, 0, 0, 0, 0},
+    { 0, 3, 6, 7, 0, 0, 0, 0},
+    { 1, 3, 6, 7, 0, 0, 0, 0},
+    { 0, 1, 3, 6, 7, 0, 0, 0},
+    { 2, 3, 6, 7, 0, 0, 0, 0},
+    { 0, 2, 3, 6, 7, 0, 0, 0},
+    { 1, 2, 3, 6, 7, 0, 0, 0},
+    { 0, 1, 2, 3, 6, 7, 0, 0},
+    { 4, 6, 7, 0, 0, 0, 0, 0},
+    { 0, 4, 6, 7, 0, 0, 0, 0},
+    { 1, 4, 6, 7, 0, 0, 0, 0},
+    { 0, 1, 4, 6, 7, 0, 0, 0},
+    { 2, 4, 6, 7, 0, 0, 0, 0},
+    { 0, 2, 4, 6, 7, 0, 0, 0},
+    { 1, 2, 4, 6, 7, 0, 0, 0},
+    { 0, 1, 2, 4, 6, 7, 0, 0},
+    { 3, 4, 6, 7, 0, 0, 0, 0},
+    { 0, 3, 4, 6, 7, 0, 0, 0},
+    { 1, 3, 4, 6, 7, 0, 0, 0},
+    { 0, 1, 3, 4, 6, 7, 0, 0},
+    { 2, 3, 4, 6, 7, 0, 0, 0},
+    { 0, 2, 3, 4, 6, 7, 0, 0},
+    { 1, 2, 3, 4, 6, 7, 0, 0},
+    { 0, 1, 2, 3, 4, 6, 7, 0},
+    { 5, 6, 7, 0, 0, 0, 0, 0},
+    { 0, 5, 6, 7, 0, 0, 0, 0},
+    { 1, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 1, 5, 6, 7, 0, 0, 0},
+    { 2, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 2, 5, 6, 7, 0, 0, 0},
+    { 1, 2, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 2, 5, 6, 7, 0, 0},
+    { 3, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 3, 5, 6, 7, 0, 0, 0},
+    { 1, 3, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 3, 5, 6, 7, 0, 0},
+    { 2, 3, 5, 6, 7, 0, 0, 0},
+    { 0, 2, 3, 5, 6, 7, 0, 0},
+    { 1, 2, 3, 5, 6, 7, 0, 0},
+    { 0, 1, 2, 3, 5, 6, 7, 0},
+    { 4, 5, 6, 7, 0, 0, 0, 0},
+    { 0, 4, 5, 6, 7, 0, 0, 0},
+    { 1, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 1, 4, 5, 6, 7, 0, 0},
+    { 2, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 2, 4, 5, 6, 7, 0, 0},
+    { 1, 2, 4, 5, 6, 7, 0, 0},
+    { 0, 1, 2, 4, 5, 6, 7, 0},
+    { 3, 4, 5, 6, 7, 0, 0, 0},
+    { 0, 3, 4, 5, 6, 7, 0, 0},
+    { 1, 3, 4, 5, 6, 7, 0, 0},
+    { 0, 1, 3, 4, 5, 6, 7, 0},
+    { 2, 3, 4, 5, 6, 7, 0, 0},
+    { 0, 2, 3, 4, 5, 6, 7, 0},
+    { 1, 2, 3, 4, 5, 6, 7, 0},
+    { 0, 1, 2, 3, 4, 5, 6, 7}
+};
+
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { /* keeps every 23-bit candidate < Q from buf, packed to the front of r; returns how many were kept */
+    unsigned int ctr, pos;
+    uint32_t good;
+    __m256i d, tmp;
+    const __m256i bound = _mm256_set1_epi32(Q);
+    const __m256i mask = _mm256_set1_epi32(0x7FFFFF); /* low 23 bits of each 32-bit lane */
+    const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, /* per 128-bit half: spread 3-byte groups into 4-byte lanes, -1 zeroes the top byte */
+                                         -1, 9, 8, 7, -1, 6, 5, 4,
+                                         -1, 11, 10, 9, -1, 8, 7, 6,
+                                         -1, 5, 4, 3, -1, 2, 1, 0);
+
+    ctr = pos = 0;
+    while (pos <= REJ_UNIFORM_BUFLEN - 24) { /* NOTE(review): bounded by pos only, not by ctr -- r must have room for every candidate in buf; confirm the caller's buffer size */
+        d = _mm256_loadu_si256((__m256i *)&buf[pos]); /* reads 32 bytes although only 24 are consumed, hence the +8 in buf's declared size */
+        d = _mm256_permute4x64_epi64(d, 0x94); /* qwords (0,1,1,2): low half = bytes 0..15, high half = bytes 8..23 */
+        d = _mm256_shuffle_epi8(d, idx8);
+        d = _mm256_and_si256(d, mask);
+        pos += 24;
+
+        tmp = _mm256_sub_epi32(d, bound); /* sign bit set exactly in the lanes where d < Q */
+        good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); /* 8-bit accept mask; cast intrinsic instead of the GCC-only (__m256) vector cast */
+        tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3AES_AVX2_idxlut[good])); /* row `good` lists the positions of the set bits */
+        d = _mm256_permutevar8x32_epi32(d, tmp); /* move accepted lanes to the front */
+
+        _mm256_storeu_si256((__m256i *)&r[ctr], d); /* always stores 8 lanes; only popcount(good) of them are kept */
+        ctr += _mm_popcnt_u32(good);
+
+    }
+
+
+    return ctr;
+}
+
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { /* keeps every nibble < 9 from buf, stored as 4 - nibble; returns how many were written (at most N) */
+    unsigned int ctr, pos;
+    uint32_t good;
+    __m256i f0, f1;
+    __m128i g0, g1;
+    const __m256i mask = _mm256_set1_epi8(15);
+    const __m256i eta = _mm256_set1_epi8(4);
+    const __m256i bound = _mm256_set1_epi8(9);
+
+    ctr = pos = 0;
+    while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) {
+        f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); /* 16 input bytes, one per 16-bit lane */
+        f1 = _mm256_slli_epi16(f0, 4);
+        f0 = _mm256_or_si256(f0, f1);
+        f0 = _mm256_and_si256(f0, mask); /* now 32 bytes: low nibble, high nibble of each input byte */
+
+        f1 = _mm256_sub_epi8(f0, bound); /* negative exactly where nibble < 9 */
+        f0 = _mm256_sub_epi8(eta, f0); /* candidate coefficient 4 - nibble */
+        good = _mm256_movemask_epi8(f1); /* 32-bit accept mask, consumed 8 bits at a time below */
+
+        g0 = _mm256_castsi256_si128(f0);
+        g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3AES_AVX2_idxlut[good & 0xFF]);
+        g1 = _mm_shuffle_epi8(g0, g1); /* compact the accepted bytes of this group of 8 to the front */
+        f1 = _mm256_cvtepi8_epi32(g1); /* sign-extend 8 bytes to 8 int32 */
+        _mm256_storeu_si256((__m256i *)&r[ctr], f1);
+        ctr += _mm_popcnt_u32(good & 0xFF);
+        good >>= 8;
+        pos += 4; /* 8 nibbles = 4 input bytes */
+
+        if (ctr > N - 8) { /* a further 8-lane store could run past r[N-1] */
+            break;
+        }
+        g0 = _mm_bsrli_si128(g0, 8);
+        g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3AES_AVX2_idxlut[good & 0xFF]);
+        g1 = _mm_shuffle_epi8(g0, g1);
+        f1 = _mm256_cvtepi8_epi32(g1);
+        _mm256_storeu_si256((__m256i *)&r[ctr], f1);
+        ctr += _mm_popcnt_u32(good & 0xFF);
+        good >>= 8;
+        pos += 4;
+
+        if (ctr > N - 8) {
+            break;
+        }
+        g0 = _mm256_extracti128_si256(f0, 1); /* upper 16 nibbles */
+        g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3AES_AVX2_idxlut[good & 0xFF]);
+        g1 = _mm_shuffle_epi8(g0, g1);
+        f1 = _mm256_cvtepi8_epi32(g1);
+        _mm256_storeu_si256((__m256i *)&r[ctr], f1);
+        ctr += _mm_popcnt_u32(good & 0xFF);
+        good >>= 8;
+        pos += 4;
+
+        if (ctr > N - 8) {
+            break;
+        }
+        g0 = _mm_bsrli_si128(g0, 8);
+        g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM3AES_AVX2_idxlut[good]); /* after three shifts by 8 only the last 8 mask bits remain, so no & 0xFF */
+        g1 = _mm_shuffle_epi8(g0, g1);
+        f1 = _mm256_cvtepi8_epi32(g1);
+        _mm256_storeu_si256((__m256i *)&r[ctr], f1);
+        ctr += _mm_popcnt_u32(good);
+        pos += 4;
+    }
+
+    uint32_t t0, t1;
+    while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { /* scalar tail: one byte = two nibbles at a time */
+        t0 = buf[pos] & 0x0F;
+        t1 = buf[pos++] >> 4;
+
+        if (t0 < 9) {
+            r[ctr++] = 4 - t0;
+        }
+        if (t1 < 9 && ctr < N) {
+            r[ctr++] = 4 - t1;
+        }
+    }
+
+    return ctr;
+}
diff --git a/crypto_sign/dilithium3aes/avx2/rejsample.h b/crypto_sign/dilithium3aes/avx2/rejsample.h
new file mode 100644
index 00000000..2f52d684
--- /dev/null
+++ b/crypto_sign/dilithium3aes/avx2/rejsample.h
@@ -0,0 +1,19 @@
+#ifndef PQCLEAN_DILITHIUM3AES_AVX2_REJSAMPLE_H
+#define PQCLEAN_DILITHIUM3AES_AVX2_REJSAMPLE_H
+#include "params.h"
+#include "symmetric.h"
+#include <stdint.h>
+
+#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
+#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES)
+
+#define REJ_UNIFORM_ETA_NBLOCKS ((228+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES)
+#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES)
+
+extern const uint8_t PQCLEAN_DILITHIUM3AES_AVX2_idxlut[256][8];
+
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]);
+
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]);
+
+#endif
diff --git a/crypto_sign/dilithium3aes/avx2/rounding.c b/crypto_sign/dilithium3aes/avx2/rounding.c
new
file mode 100644
index 00000000..f7475870
--- /dev/null
+++ b/crypto_sign/dilithium3aes/avx2/rounding.c
@@ -0,0 +1,154 @@
+#include "consts.h"
+#include "params.h"
+#include "rejsample.h"
+#include "rounding.h"
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+
+#define _mm256_blendv_epi32(a,b,mask) \
+    _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
+                                         _mm256_castsi256_ps(b), \
+                                         _mm256_castsi256_ps(mask)))
+
+/*************************************************
+* Name:        power2round
+*
+* Description: For finite field elements a, compute a0, a1 such that
+*              a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+*              Assumes a to be positive standard representative.
+*
+* Arguments:   - __m256i *a1: output array of length N/8 with high bits
+*              - __m256i *a0: output array of length N/8 with low bits a0
+*              - const __m256i *a: input array of length N/8
+*
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
+    unsigned int i;
+    __m256i f, f0, f1;
+    const __m256i mask = _mm256_set1_epi32(-(1 << D)); /* clears the low D bits */
+    const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); /* rounding offset 2^(D-1) - 1 */
+
+    for (i = 0; i < N / 8; ++i) { /* 8 coefficients per iteration; aligned loads/stores, so all three arrays must be 32-byte aligned */
+        f = _mm256_load_si256(&a[i]);
+        f1 = _mm256_add_epi32(f, half);
+        f0 = _mm256_and_si256(f1, mask); /* a1 * 2^D */
+        f1 = _mm256_srli_epi32(f1, D); /* a1 */
+        f0 = _mm256_sub_epi32(f, f0); /* a0 = a - a1 * 2^D */
+        _mm256_store_si256(&a1[i], f1);
+        _mm256_store_si256(&a0[i], f0);
+    }
+}
+
+/*************************************************
+* Name: decompose
+*
+* Description: For finite field element a, compute high and low parts a0, a1 such
+* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+* if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard
+* representative.
+*
+* Arguments:   - __m256i *a1: output array of length N/8 with high parts
+*              - __m256i *a0: output array of length N/8 with low parts a0
+*              - const __m256i *a: input array of length N/8
+*
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) {
+    unsigned int i;
+    __m256i f, f0, f1;
+    const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM3AES_AVX2_qdata.vec[_8XQ / 8]); /* presumably Q broadcast to 8 lanes (going by the _8XQ name) -- confirm in consts.h */
+    const __m256i hq = _mm256_srli_epi32(q, 1); /* q >> 1, threshold for the final centring step */
+    const __m256i v = _mm256_set1_epi32(1025); /* multiplier for the 16-bit high multiply; the upper 16-bit half of each lane is 0 */
+    const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2);
+    const __m256i off = _mm256_set1_epi32(127);
+    const __m256i shift = _mm256_set1_epi32(512);
+    const __m256i mask = _mm256_set1_epi32(15); /* a1 is kept to 4 bits; presumably GAMMA2 = (Q-1)/32 so a1 ranges over 0..15 -- confirm in params.h */
+
+    for (i = 0; i < N / 8; i++) {
+        f = _mm256_load_si256(&a[i]);
+        f1 = _mm256_add_epi32(f, off);
+        f1 = _mm256_srli_epi32(f1, 7); /* (a + 127) >> 7, i.e. ceil(a / 128) */
+        f1 = _mm256_mulhi_epu16(f1, v); /* (low16(f1) * 1025) >> 16; correct only while f1 fits in 16 bits, i.e. a < 2^23 */
+        f1 = _mm256_mulhrs_epi16(f1, shift); /* rounding multiply by 512/2^15: (f1 + 32) >> 6 */
+        f1 = _mm256_and_si256(f1, mask); /* wraps a quotient of 16 back to 0 */
+        f0 = _mm256_mullo_epi32(f1, alpha);
+        f0 = _mm256_sub_epi32(f, f0); /* a0 = a - a1 * ALPHA */
+        f = _mm256_cmpgt_epi32(f0, hq); /* all-ones in lanes where a0 > q/2 (f is reused as a mask from here on) */
+        f = _mm256_and_si256(f, q);
+        f0 = _mm256_sub_epi32(f0, f); /* subtract q in exactly those lanes */
+        _mm256_store_si256(&a1[i], f1);
+        _mm256_store_si256(&a0[i], f0);
+    }
+}
+
+
+/*************************************************
+* Name: make_hint
+*
+* Description: Compute indices of polynomial coefficients whose low bits
+* overflow into the high bits.
+*
+* Arguments:   - uint8_t *hint: output array of N bytes; receives the indices of the
+*              - const __m256i *a0: low bits of input elements
+*              - const __m256i *a1: high bits of input elements
+*
+* Returns number of overflowing low bits (= number of valid leading entries of hint; later bytes are scratch)
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) {
+    unsigned int i, n = 0;
+    __m256i f0, f1, g0, g1;
+    uint32_t bad;
+    uint64_t idx;
+    const __m256i low = _mm256_set1_epi32(-GAMMA2);
+    const __m256i high = _mm256_set1_epi32(GAMMA2);
+
+    for (i = 0; i < N / 8; ++i) {
+        f0 = _mm256_load_si256(&a0[i]);
+        f1 = _mm256_load_si256(&a1[i]);
+        g0 = _mm256_abs_epi32(f0);
+        g0 = _mm256_cmpgt_epi32(g0, high); /* all-ones where |a0| > GAMMA2 */
+        g1 = _mm256_cmpeq_epi32(f0, low); /* all-ones where a0 == -GAMMA2 */
+        g1 = _mm256_sign_epi32(g1, f1); /* keeps that -1 only where a1 > 0 (zeroed for a1 == 0, negated for a1 < 0) */
+        g0 = _mm256_or_si256(g0, g1);
+
+        bad = _mm256_movemask_ps(_mm256_castsi256_ps(g0)); /* 8-bit mask of flagged lanes; cast intrinsic instead of the GCC-only (__m256) vector cast */
+        memcpy(&idx, PQCLEAN_DILITHIUM3AES_AVX2_idxlut[bad], 8); /* lane numbers of the set bits, packed to the front */
+        idx += (uint64_t)0x0808080808080808 * i; /* add 8*i to every byte: lane number -> coefficient index (at most 255, so no carry between bytes) */
+        memcpy(&hint[n], &idx, 8); /* always writes 8 bytes; only popcount(bad) of them are kept */
+        n += _mm_popcnt_u32(bad);
+    }
+
+    return n;
+}
+
+/*************************************************
+* Name: use_hint
+*
+* Description: Correct high parts according to hint.
+*
+* Arguments:   - __m256i *b: output array of length N/8 with corrected high parts
+*              - const __m256i *a: input array of length N/8
+*              - const __m256i *hint: input array of length N/8 with hint bits
+*
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) {
+    unsigned int i;
+    __m256i a0[N / 8];
+    __m256i f, g, h, t;
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i mask = _mm256_set1_epi32(15);
+
+    PQCLEAN_DILITHIUM3AES_AVX2_decompose_avx(b, a0, a); /* b <- high parts, a0 <- low parts */
+    for (i = 0; i < N / 8; i++) {
+        f = _mm256_load_si256(&a0[i]);
+        g = _mm256_load_si256(&b[i]);
+        h = _mm256_load_si256(&hint[i]);
+        t = _mm256_blendv_epi32(zero, h, f); /* h where the sign bit of a0 is set, 0 elsewhere */
+        t = _mm256_slli_epi32(t, 1);
+        h = _mm256_sub_epi32(h, t); /* -h where a0 < 0, +h where a0 >= 0 (so a0 == 0 takes the + branch) */
+        g = _mm256_add_epi32(g, h);
+        g = _mm256_and_si256(g, mask); /* wrap the corrected high part into 0..15 */
+        _mm256_store_si256(&b[i], g);
+    }
+}
diff --git a/crypto_sign/dilithium3aes/avx2/rounding.h b/crypto_sign/dilithium3aes/avx2/rounding.h
new file mode 100644
index 00000000..7ed2686b
--- /dev/null
+++ b/crypto_sign/dilithium3aes/avx2/rounding.h
@@ -0,0 +1,12 @@
+#ifndef PQCLEAN_DILITHIUM3AES_AVX2_ROUNDING_H
+#define PQCLEAN_DILITHIUM3AES_AVX2_ROUNDING_H
+#include "params.h"
+#include <immintrin.h>
+#include <stdint.h>
+
+void PQCLEAN_DILITHIUM3AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a);
+void PQCLEAN_DILITHIUM3AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a);
+unsigned int PQCLEAN_DILITHIUM3AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1);
+void PQCLEAN_DILITHIUM3AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint);
+
+#endif
diff --git a/crypto_sign/dilithium3aes/avx2/shuffle.S b/crypto_sign/dilithium3aes/avx2/shuffle.S
new file mode 100644
index 00000000..77a42ea5
--- /dev/null
+++ b/crypto_sign/dilithium3aes/avx2/shuffle.S
@@ -0,0 +1,54 @@
+#include "cdecl.h"
+.include "shuffle.inc"
+
+.text
+nttunpack128_avx:
+#load
+vmovdqa (%rdi),%ymm4
+vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM3AES_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium3aes/avx2/shuffle.inc b/crypto_sign/dilithium3aes/avx2/shuffle.inc new file mode 100644 index 00000000..73e9ffe0 --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/shuffle.inc @@ -0,0 +1,25 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium3aes/avx2/sign.c b/crypto_sign/dilithium3aes/avx2/sign.c new file mode 100644 index 00000000..7c2c2a08 --- /dev/null +++ 
b/crypto_sign/dilithium3aes/avx2/sign.c @@ -0,0 +1,425 @@ +#include "aes256ctr.h" +#include "align.h" +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include +#include + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. +* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + const uint8_t *rho, *rhoprime, *key; + uint64_t nonce; + aes256ctr_ctx aesctx; + polyvecl rowbuf[1]; + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + for (i = 0; i < L; ++i) { + nonce = i; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta_preinit(&s1.vec[i], &aesctx); + } + for (i = 0; i < K; ++i) { + nonce = L + i; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_eta_preinit(&s2.vec[i], &aesctx); + } + + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + 
PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM3AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); + } + + /* Transform s1 */ + PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt(&s1); + + PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&aesctx, rho, 0); + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + for (unsigned int j = 0; j < L; j++) { + nonce = (i << 8) + j; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); + PQCLEAN_DILITHIUM3AES_AVX2_poly_nttunpack(&row->vec[j]); + } + + /* Compute inner-product */ + PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM3AES_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM3AES_AVX2_poly_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM3AES_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM3AES_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES); + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_signature +* +* Description: Computes signature. 
+* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM3AES_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_ntt(&t0); + + aes256ctr_ctx aesctx; + PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + +rej: + /* Sample intermediate vector y */ + for (i = 0; i < L; ++i) { + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce++; + PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_gamma1_preinit(&z.vec[i], &aesctx); + } + + /* Matrix-vector product */ + tmpv.y = z; + 
PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM3AES_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); + PQCLEAN_DILITHIUM3AES_AVX2_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM3AES_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&c); + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3AES_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&z.vec[i]); + if (PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { + goto rej; + } + } + + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3AES_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&tmp); + if 
(PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM3AES_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM3AES_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; + } + + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM3AES_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); + } + + *siglen = PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_verify +* +* Description: Verifies signature. 
+*
+* Arguments:   - const uint8_t *sig: pointer to input signature
+*              - size_t siglen: length of signature
+*              - const uint8_t *m: pointer to message
+*              - size_t mlen: length of message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signature could be verified correctly and -1 otherwise
+**************************************************/
+int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) {
+    unsigned int i, j, pos = 0;
+    /* PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack writes additional 14 bytes */
+    ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf;
+    uint8_t mu[CRHBYTES];
+    const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; /* signature layout: challenge seed | z | hint */
+    uint64_t nonce;
+    aes256ctr_ctx aesctx;
+    polyvecl rowbuf[1];
+    polyvecl *row = rowbuf;
+    polyvecl z;
+    poly c, w1, h;
+    shake256incctx state;
+
+    if (siglen != PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES) {
+        return -1;
+    }
+
+    /* Compute CRH(CRH(rho, t1), msg) */
+    crh(mu, pk, PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_PUBLICKEYBYTES);
+    shake256_inc_init(&state);
+    shake256_inc_absorb(&state, mu, CRHBYTES);
+    shake256_inc_absorb(&state, m, mlen);
+    shake256_inc_finalize(&state);
+    shake256_inc_squeeze(mu, CRHBYTES, &state);
+    shake256_inc_ctx_release(&state);
+
+    /* Expand challenge */
+    PQCLEAN_DILITHIUM3AES_AVX2_poly_challenge(&c, sig);
+    PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&c);
+
+    /* Unpack z and enforce ||z||_inf < GAMMA1 - BETA: unpacking alone only bounds z by GAMMA1, while the signer rejects at GAMMA1 - BETA */
+    for (i = 0; i < L; i++) {
+        PQCLEAN_DILITHIUM3AES_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES);
+        if (PQCLEAN_DILITHIUM3AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { /* must run before the NTT, on the coefficient representation */
+            return -1;
+        }
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&z.vec[i]);
+    }
+
+    PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(&aesctx, pk, 0); /* pk begins with rho, the matrix seed */
+
+    for (i = 0; i < K; i++) {
+        /* Expand matrix row */
+        for (j = 0; j < L; j++) {
+            nonce = (i << 8) + j; /* same (row, column) nonce encoding as in key generation */
+            aesctx.n = _mm_loadl_epi64((__m128i *)&nonce);
+            PQCLEAN_DILITHIUM3AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx);
+            PQCLEAN_DILITHIUM3AES_AVX2_poly_nttunpack(&row->vec[j]);
+        }
+
+        /* Compute i-th row of Az - c2^Dt1 */
+        PQCLEAN_DILITHIUM3AES_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z);
+
+        PQCLEAN_DILITHIUM3AES_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); /* h is used as scratch for t1 here */
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_shiftl(&h);
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_ntt(&h);
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_pointwise_montgomery(&h, &c, &h);
+
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_sub(&w1, &w1, &h);
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_reduce(&w1);
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_invntt_tomont(&w1);
+
+        /* Get hint polynomial and reconstruct w1 */
+        memset(h.vec, 0, sizeof(poly));
+        if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { /* per-row end offsets must be non-decreasing and at most OMEGA */
+            return -1;
+        }
+
+        for (j = pos; j < hint[OMEGA + i]; ++j) {
+            /* Coefficients are ordered for strong unforgeability */
+            if (j > pos && hint[j] <= hint[j - 1]) {
+                return -1;
+            }
+            h.coeffs[hint[j]] = 1;
+        }
+        pos = hint[OMEGA + i];
+
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_caddq(&w1);
+        PQCLEAN_DILITHIUM3AES_AVX2_poly_use_hint(&w1, &w1, &h);
+        PQCLEAN_DILITHIUM3AES_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1);
+    }
+
+    /* Extra indices are zero for strong unforgeability */
+    for (j = pos; j < OMEGA; ++j) {
+        if (hint[j]) {
+            return -1;
+        }
+    }
+
+    /* Call random oracle and verify challenge */
+    shake256_inc_init(&state);
+    shake256_inc_absorb(&state, mu, CRHBYTES);
+    shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES);
+    shake256_inc_finalize(&state);
+    shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); /* recomputed challenge seed overwrites the start of buf */
+    shake256_inc_ctx_release(&state);
+    for (i = 0; i < SEEDBYTES; ++i) {
+        if (buf.coeffs[i] != sig[i]) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name: PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_open
+*
+* Description: Verify signed message.
+*
+* Arguments:   - uint8_t *m: pointer to output message (allocated
+*                            array with smlen bytes), can be equal to sm
+*              - size_t *mlen: pointer to output length of message
+*              - const uint8_t *sm: pointer to signed message
+*              - size_t smlen: length of signed message
+*              - const uint8_t *pk: pointer to bit-packed public key
+*
+* Returns 0 if signed message could be verified correctly and -1 otherwise
+**************************************************/
+int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) {
+    size_t i;
+
+    if (smlen < PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES) { /* too short to contain a signature at all */
+        goto badsig;
+    }
+
+    *mlen = smlen - PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES; /* signed message layout: signature | message */
+    if (PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES, *mlen, pk)) {
+        goto badsig;
+    } else {
+        /* All good, copy msg, return 0 */
+        for (i = 0; i < *mlen; ++i) { /* forward copy towards lower addresses, so m == sm is safe */
+            m[i] = sm[PQCLEAN_DILITHIUM3AES_AVX2_CRYPTO_BYTES + i];
+        }
+        return 0;
+    }
+
+badsig:
+    /* Signature verification failed */
+    *mlen = (size_t) -1; /* explicit cast: same value (SIZE_MAX) as before, without the implicit negative-to-unsigned conversion */
+    for (i = 0; i < smlen; ++i) { /* wipe the whole output buffer so no unverified message bytes are returned */
+        m[i] = 0;
+    }
+
+    return -1;
+}
diff --git a/crypto_sign/dilithium3aes/avx2/sign.h b/crypto_sign/dilithium3aes/avx2/sign.h
new file mode 100644
index 00000000..4b6feee1
--- /dev/null
+++ b/crypto_sign/dilithium3aes/avx2/sign.h
@@ -0,0 +1,29 @@
+#ifndef PQCLEAN_DILITHIUM3AES_AVX2_SIGN_H
+#define PQCLEAN_DILITHIUM3AES_AVX2_SIGN_H
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void PQCLEAN_DILITHIUM3AES_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const
uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM3AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3aes/avx2/symmetric.h b/crypto_sign/dilithium3aes/avx2/symmetric.h new file mode 100644 index 00000000..6cfad5ba --- /dev/null +++ b/crypto_sign/dilithium3aes/avx2/symmetric.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_DILITHIUM3AES_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3AES_AVX2_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) +#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM3AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium3aes/clean/LICENSE b/crypto_sign/dilithium3aes/clean/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium3aes/clean/Makefile b/crypto_sign/dilithium3aes/clean/Makefile new file mode 100644 index 00000000..60655614 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium3aes_clean.a +HEADERS=aes256ctr.h api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=aes256ctr.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-aes.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium3aes/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium3aes/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..0de00dcc --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/Makefile.Microsoft_nmake @@ -0,0 +1,23 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libdilithium3aes_clean.lib +OBJECTS=aes256ctr.obj ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-aes.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_sign/dilithium3aes/clean/aes256ctr.c b/crypto_sign/dilithium3aes/clean/aes256ctr.c new file mode 100644 index 00000000..688cb5e7 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
/*
 * Little-endian load of one 32-bit word: src[0] is the least significant
 * byte, src[3] the most significant.
 */
static inline uint32_t br_dec32le(const uint8_t *src) {
    uint32_t word = src[0];

    word |= (uint32_t)src[1] << 8;
    word |= (uint32_t)src[2] << 16;
    word |= (uint32_t)src[3] << 24;
    return word;
}

/*
 * Decode `num` consecutive little-endian 32-bit words from src into v.
 * Reads 4 * num bytes from src.
 */
static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) {
    size_t idx;

    for (idx = 0; idx < num; idx++) {
        v[idx] = br_dec32le(src + 4 * idx);
    }
}

/* Reverse the order of the four bytes of x. */
static inline uint32_t br_swap32(uint32_t x) {
    return (x >> 24)
           | ((x >> 8) & (uint32_t)0x0000FF00)
           | ((x << 8) & (uint32_t)0x00FF0000)
           | (x << 24);
}

/* Store x at dst as four bytes, least significant byte first. */
static inline void br_enc32le(uint8_t *dst, uint32_t x) {
    unsigned int byte;

    for (byte = 0; byte < 4; byte++) {
        dst[byte] = (uint8_t)(x >> (8 * byte));
    }
}

/*
 * Encode `num` 32-bit words from v into dst, each as four little-endian
 * bytes. Writes 4 * num bytes to dst.
 */
static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) {
    size_t idx;

    for (idx = 0; idx < num; idx++) {
        br_enc32le(dst + 4 * idx, v[idx]);
    }
}
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
/*
 * In-place bit permutation of the eight state words q[0..7].
 *
 * Three stages are applied, with distance d = 1, 2, 4. In stage d, every
 * pair (q[i], q[i + d]) with bit d of i clear exchanges bit groups of width
 * d: q[i] keeps its groups selected by the low mask and receives the
 * partner's low-mask groups shifted up by d, while q[i + d] receives q[i]'s
 * high-mask groups shifted down by d and keeps its own high-mask groups.
 * The pairs within one stage are disjoint, so their order is irrelevant.
 */
static void br_aes_ct64_ortho(uint64_t *q) {
    static const uint64_t low_mask[3] = {
        (uint64_t)0x5555555555555555,
        (uint64_t)0x3333333333333333,
        (uint64_t)0x0F0F0F0F0F0F0F0F
    };
    unsigned int stage, i;

    for (stage = 0; stage < 3; stage++) {
        const unsigned int dist = 1u << stage;
        const uint64_t lo = low_mask[stage];
        const uint64_t hi = ~lo;

        for (i = 0; i < 8; i++) {
            uint64_t a, b;

            /* Only the lower index of each pair drives the exchange. */
            if ((i & dist) != 0) {
                continue;
            }
            a = q[i];
            b = q[i + dist];
            q[i] = (a & lo) | ((b & lo) << dist);
            q[i + dist] = ((a & hi) >> dist) | (b & hi);
        }
    }
}
/*
 * Split two 64-bit words back into four 32-bit words.
 *
 * The even-numbered bytes of q0 are gathered into w[0], the even-numbered
 * bytes of q1 into w[1], the odd-numbered bytes of q0 into w[2] and the
 * odd-numbered bytes of q1 into w[3] (byte 0 being the least significant).
 */
static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) {
    const uint64_t byte_mask = (uint64_t)0x00FF00FF00FF00FF;
    const uint64_t half_mask = (uint64_t)0x0000FFFF0000FFFF;
    uint64_t lane[4];
    unsigned int i;

    lane[0] = q0 & byte_mask;
    lane[1] = q1 & byte_mask;
    lane[2] = (q0 >> 8) & byte_mask;
    lane[3] = (q1 >> 8) & byte_mask;

    for (i = 0; i < 4; i++) {
        uint64_t x = lane[i];

        /* Close the one-byte gaps, then the two-byte gap. */
        x = (x | (x >> 8)) & half_mask;
        w[i] = (uint32_t)x | (uint32_t)(x >> 16);
    }
}
/*
 * Expand the compressed key schedule.
 *
 * Each of the (14 + 1) * 2 words of comp_skey yields four words of skey.
 * Output word `bit` keeps input bits bit, bit + 4, bit + 8, ... moved to
 * the bottom of their nibble; (x << 4) - x then spreads every kept bit
 * over its whole nibble.
 */
static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) {
    const unsigned int n = (14 + 1) << 1;
    unsigned int u, bit;

    for (u = 0; u < n; u++) {
        const uint64_t packed = comp_skey[u];

        for (bit = 0; bit < 4; bit++) {
            const uint64_t x = (packed >> bit) & (uint64_t)0x1111111111111111;

            skey[4 * u + bit] = (x << 4) - x;
        }
    }
}

/* XOR the eight round-key words sk[0..7] into the state words q[0..7]. */
static inline void add_round_key(uint64_t *q, const uint64_t *sk) {
    unsigned int i;

    for (i = 0; i < 8; i++) {
        q[i] ^= sk[i];
    }
}

/*
 * Apply the row shift to each of the eight state words. Within a word,
 * bits 0-15 stay in place and the 16-bit fields at bits 16-31, 32-47 and
 * 48-63 are rotated right by 4, 8 and 12 bits respectively.
 */
static inline void shift_rows(uint64_t *q) {
    uint64_t *word;

    for (word = q; word != q + 8; word++) {
        const uint64_t x = *word;
        const uint64_t lane0 = x & (uint64_t)0x000000000000FFFF;
        const uint64_t lane1 = ((x & (uint64_t)0x00000000FFF00000) >> 4)
                               | ((x & (uint64_t)0x00000000000F0000) << 12);
        const uint64_t lane2 = ((x & (uint64_t)0x0000FF0000000000) >> 8)
                               | ((x & (uint64_t)0x000000FF00000000) << 8);
        const uint64_t lane3 = ((x & (uint64_t)0xF000000000000000) >> 12)
                               | ((x & (uint64_t)0x0FFF000000000000) << 4);

        *word = lane0 | lane1 | lane2 | lane3;
    }
}

/* Exchange the upper and lower 32-bit halves of x. */
static inline uint64_t rotr32(uint64_t x) {
    return (x >> 32) | (x << 32);
}
/*
 * Produce 64 bytes of keystream from four counter blocks.
 *
 * out    - receives the 64 output bytes (four 16-byte blocks).
 * ivw    - the four 128-bit counter blocks as 16 words; after the call the
 *          last word of each block (ivw[3], ivw[7], ivw[11], ivw[15]) has
 *          been advanced by 4, treating that word as big-endian.
 * sk_exp - expanded round keys, 8 words per round key. The function reads
 *          sk_exp[0..119] (round keys 0..14), so the bound is 120 words,
 *          matching the buffers built by br_aes_ct64_ctr_init. The pointer
 *          is const because the schedule is only read.
 */
static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], const uint64_t sk_exp[120]) {
    uint32_t w[16];
    uint64_t q[8];
    int i;

    /* Work on a copy so the caller's counter blocks are only touched below. */
    memcpy(w, ivw, sizeof(w));
    for (i = 0; i < 4; i++) {
        br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2));
    }
    br_aes_ct64_ortho(q);

    /* Initial key addition, 13 full rounds, then a final round without
     * mix_columns that uses the round key at word offset 112. */
    add_round_key(q, sk_exp);
    for (i = 1; i < 14; i++) {
        br_aes_ct64_bitslice_Sbox(q);
        shift_rows(q);
        mix_columns(q);
        add_round_key(q, sk_exp + (i << 3));
    }
    br_aes_ct64_bitslice_Sbox(q);
    shift_rows(q);
    add_round_key(q, sk_exp + 112);

    br_aes_ct64_ortho(q);
    for (i = 0; i < 4; i++) {
        br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]);
    }
    br_range_enc32le(out, w, 16);

    /* Increase counter for next 4 blocks */
    inc4_be(ivw + 3);
    inc4_be(ivw + 7);
    inc4_be(ivw + 11);
    inc4_be(ivw + 15);
}
+ ivw[15] = br_swap32(cc + 3); + + while (len > 64) { + aes_ctr4x(data, ivw, sk_exp); + data += 64; + len -= 64; + } + if (len > 0) { + uint8_t tmp[64]; + aes_ctr4x(tmp, ivw, sk_exp); + for (i = 0; i < len; i++) { + data[i] = tmp[i]; + } + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { + uint64_t sk_exp[120]; + + br_aes_ct64_ctr_init(sk_exp, key); + br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { + br_aes_ct64_ctr_init(s->sk_exp, key); + + br_range_dec32le(s->ivw, 3, nonce); + memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); + s->ivw[ 3] = br_swap32(0); + s->ivw[ 7] = br_swap32(1); + s->ivw[11] = br_swap32(2); + s->ivw[15] = br_swap32(3); +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { + while (nblocks > 0) { + aes_ctr4x(out, s->ivw, s->sk_exp); + out += 64; + nblocks--; + } +} diff --git a/crypto_sign/dilithium3aes/clean/aes256ctr.h b/crypto_sign/dilithium3aes/clean/aes256ctr.h new file mode 100644 index 00000000..b2e7e3d6 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/aes256ctr.h @@ -0,0 +1,28 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_AES256CTR_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_AES256CTR_H + +#include +#include + +#define AES256CTR_BLOCKBYTES 64 + + +typedef struct { + uint64_t sk_exp[120]; + uint32_t ivw[16]; +} aes256ctr_ctx; + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx 
*state); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/api.h b/crypto_sign/dilithium3aes/clean/api.h new file mode 100644 index 00000000..495dd5db --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/api.h @@ -0,0 +1,32 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_API_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES 1952 +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES 4016 +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES 3293 + +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_ALGNAME "Dilithium3-AES" + + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/ntt.c b/crypto_sign/dilithium3aes/clean/ntt.c new file mode 100644 index 00000000..6c036dbb --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/ntt.c @@ -0,0 +1,98 @@ +#include "ntt.h" +#include "params.h" +#include "reduce.h" +#include + +static const int32_t zetas[N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, + 2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, + -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, 
-539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, + -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, + 3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, + -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, + -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, + -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, + 1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, + 2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, + 266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, + 900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, + -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, + 342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, + -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, + -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, + -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, + -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, + -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, + -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, + -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, + -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, + -554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 +}; + 
+/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_ntt +* +* Description: Forward NTT, in-place. No modular reduction is performed after +* additions or subtractions. Output vector is in bitreversed order. +* +* Arguments: - uint32_t p[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_ntt(int32_t a[N]) { + unsigned int len, start, j, k; + int32_t zeta, t; + + k = 0; + for (len = 128; len > 0; len >>= 1) { + for (start = 0; start < N; start = j + len) { + zeta = zetas[++k]; + for (j = start; j < start + len; ++j) { + t = PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + a[j + len] = a[j] - t; + a[j] = a[j] + t; + } + } + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_invntt_tomont +* +* Description: Inverse NTT and multiplication by Montgomery factor 2^32. +* In-place. No modular reductions after additions or +* subtractions; input coefficients need to be smaller than +* Q in absolute value. Output coefficient are smaller than Q in +* absolute value. 
+* +* Arguments: - uint32_t p[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_invntt_tomont(int32_t a[N]) { + unsigned int start, len, j, k; + int32_t t, zeta; + const int32_t f = 41978; // mont^2/256 + + k = 256; + for (len = 1; len < N; len <<= 1) { + for (start = 0; start < N; start = j + len) { + zeta = -zetas[--k]; + for (j = start; j < start + len; ++j) { + t = a[j]; + a[j] = t + a[j + len]; + a[j + len] = t - a[j + len]; + a[j + len] = PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + } + } + } + + for (j = 0; j < N; ++j) { + a[j] = PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce((int64_t)f * a[j]); + } +} diff --git a/crypto_sign/dilithium3aes/clean/ntt.h b/crypto_sign/dilithium3aes/clean/ntt.h new file mode 100644 index 00000000..9d40f949 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/ntt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_NTT_H +#include "params.h" +#include + +void PQCLEAN_DILITHIUM3AES_CLEAN_ntt(int32_t a[N]); + +void PQCLEAN_DILITHIUM3AES_CLEAN_invntt_tomont(int32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/packing.c b/crypto_sign/dilithium3aes/clean/packing.c new file mode 100644 index 00000000..9a32261d --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const polyveck *t1: pointer to output vector t1 +* - uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_pack_sk +* +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 +* - const polyvecl *s1: pointer to output vector s1 +* - const polyveck *s2: pointer to output vector s2 +* - uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM3AES_CLEAN_challenge hash length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM3AES_CLEAN_challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
/*************************************************
* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sig
*
* Description: Unpack signature sig = (c, z, h) and validate the hint
*              encoding.
*
* Arguments:   - uint8_t *c: pointer to output challenge hash
*              - polyvecl *z: pointer to output vector z
*              - polyveck *h: pointer to output hint vector h
*              - const uint8_t sig[]: byte array containing
*                bit-packed signature
*
* Returns 1 in case of malformed signature; otherwise 0.
* On a return of 1, h (and c, z) may already be partially written.
**************************************************/
int PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES],
        polyvecl *z,
        polyveck *h,
        const uint8_t sig[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES]) {
    unsigned int i, j, k;

    /* Challenge hash: first SEEDBYTES bytes. */
    for (i = 0; i < SEEDBYTES; ++i) {
        c[i] = sig[i];
    }
    sig += SEEDBYTES;

    /* L packed z polynomials. */
    for (i = 0; i < L; ++i) {
        PQCLEAN_DILITHIUM3AES_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES);
    }
    sig += L * POLYZ_PACKEDBYTES;

    /* Decode h */
    /* k is the running end index into the position list sig[0..OMEGA-1];
     * sig[OMEGA + i] is the claimed end index after polynomial i. */
    k = 0;
    for (i = 0; i < K; ++i) {
        for (j = 0; j < N; ++j) {
            h->vec[i].coeffs[j] = 0;
        }

        /* End indices must be non-decreasing and never exceed OMEGA. */
        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) {
            return 1;
        }

        for (j = k; j < sig[OMEGA + i]; ++j) {
            /* Coefficients are ordered for strong unforgeability */
            /* i.e. positions within one polynomial must strictly increase. */
            if (j > k && sig[j] <= sig[j - 1]) {
                return 1;
            }
            h->vec[i].coeffs[sig[j]] = 1;
        }

        k = sig[OMEGA + i];
    }

    /* Extra indices are zero for strong unforgeability */
    for (j = k; j < OMEGA; ++j) {
        if (sig[j]) {
            return 1;
        }
    }

    return 0;
}
PQCLEAN_DILITHIUM3AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/params.h b/crypto_sign/dilithium3aes/clean/params.h new file mode 100644 index 00000000..4f315e50 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 6 +#define L 5 +#define ETA 4 +#define TAU 49 +#define BETA 196 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 55 +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_ALGNAME "Dilithium3-AES" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 128 + +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium3aes/clean/poly.c b/crypto_sign/dilithium3aes/clean/poly.c new file mode 100644 index 00000000..ea7e8ad4 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/poly.c @@ 
-0,0 +1,818 @@ +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rounding.h" +#include "symmetric.h" +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_reduce(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_reduce32(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_caddq(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_freeze(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_freeze(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_add +* +* Description: Add polynomials. 
No modular reduction is performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first summand
+*              - const poly *b: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b)  {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N; ++i) {
+        c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
+    }
+
+    DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_poly_sub
+*
+* Description: Subtract polynomials. No modular reduction is
+*              performed.
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const poly *a: pointer to first input polynomial
+*              - const poly *b: pointer to second input polynomial to be
+*                               subtracted from first input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N; ++i) {
+        c->coeffs[i] = a->coeffs[i] - b->coeffs[i];
+    }
+
+    DBENCH_STOP(*tadd);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_poly_shiftl
+*
+* Description: Multiply polynomial by 2^D without modular reduction. Assumes
+*              input coefficients to be less than 2^{31-D} in absolute value.
+*
+* Arguments:   - poly *a: pointer to input/output polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_shiftl(poly *a) {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N; ++i) {
+        a->coeffs[i] *= (1 << D); /* multiply, not <<=: left-shifting a negative value is undefined in C */
+    }
+
+    DBENCH_STOP(*tmul);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt
+*
+* Description: Inplace forward NTT. Coefficients can grow by
+*              8*Q in absolute value.
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_CLEAN_ntt(a->coeffs); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM3AES_CLEAN_invntt_tomont(a->coeffs); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; + } + + DBENCH_STOP(*tround); + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM3AES_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM3AES_CLEAN_reduce32(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. 
*/ + for (i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); + + if (t >= B) { + DBENCH_STOP(*tsample); + return 1; + } + } + + DBENCH_STOP(*tsample); + return 0; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < Q) { + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = rej_uniform(a->coeffs, N, buf, buflen); + + while (ctr < N) { + off = buflen % 3; + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; + } + + stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); + } + stream128_release(&state); +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 9) { + a[ctr++] = 4 - t0; + } + if (t1 < 9 && ctr < len) { + a[ctr++] = 4 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS ((227 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = rej_eta(a->coeffs, N, buf, buflen); + + while (ctr < N) { + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } + stream128_release(&state); +} + +/************************************************* +* Name: poly_uniform_gamma1m1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+*
+* Arguments:   - poly *a: pointer to output polynomial
+*              - const uint8_t seed[]: byte array with seed of length CRHBYTES
+*              - uint16_t nonce: 16-bit nonce
+**************************************************/
+#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES)
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_gamma1(poly *a,
+        const uint8_t seed[CRHBYTES],
+        uint16_t nonce) {
+    uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES];
+    stream256_state state;
+
+    stream256_init(&state, seed, nonce);
+    stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state);
+    stream256_release(&state);
+    PQCLEAN_DILITHIUM3AES_CLEAN_polyz_unpack(a, buf);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_poly_challenge
+*
+* Description: Implementation of H. Samples polynomial with TAU nonzero
+*              coefficients in {-1,1} using the output stream of
+*              SHAKE256(seed).
+*
+* Arguments:   - poly *c: pointer to output polynomial
+*              - const uint8_t seed[]: byte array containing seed of length SEEDBYTES
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) {
+    unsigned int i, b, pos;
+    uint64_t signs;
+    uint8_t buf[SHAKE256_RATE];
+    shake256incctx state;
+
+    shake256_inc_init(&state);
+    shake256_inc_absorb(&state, seed, SEEDBYTES);
+    shake256_inc_finalize(&state);
+    shake256_inc_squeeze(buf, sizeof buf, &state);
+
+    signs = 0; /* first 8 squeezed bytes, little-endian: one sign bit per nonzero coefficient */
+    for (i = 0; i < 8; ++i) {
+        signs |= (uint64_t)buf[i] << 8 * i;
+    }
+    pos = 8;
+
+    for (i = 0; i < N; ++i) {
+        c->coeffs[i] = 0;
+    }
+    for (i = N - TAU; i < N; ++i) {
+        do { /* rejection-sample a byte b with b <= i, refilling buf when it runs out */
+            if (pos >= SHAKE256_RATE) {
+                shake256_inc_squeeze(buf, sizeof buf, &state);
+                pos = 0;
+            }
+
+            b = buf[pos++];
+        } while (b > i);
+
+        c->coeffs[i] = c->coeffs[b];
+        c->coeffs[b] = 1 - 2 * (int32_t)(signs & 1); /* cast first so the +1/-1 is computed in signed arithmetic */
+        signs >>= 1;
+    }
+    shake256_inc_ctx_release(&state);
+}
+
+/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = (uint8_t) (ETA - a->coeffs[2 * i + 0]); + t[1] = (uint8_t) (ETA - a->coeffs[2 * i + 1]); + r[i] = t[0] | (t[1] << 4); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + r->coeffs[2 * i + 0] = a[i] & 0x0F; + r->coeffs[2 * i + 1] = a[i] >> 4; + r->coeffs[2 * i + 0] = ETA - r->coeffs[2 * i + 0]; + r->coeffs[2 * i + 1] = ETA - r->coeffs[2 * i + 1]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) 
- r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyz_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = (uint8_t) t[0]; + r[5 * i + 1] = (uint8_t) (t[0] >> 8); + r[5 * i + 2] = (uint8_t) (t[0] >> 16); + r[5 * i + 2] |= (uint8_t) (t[1] << 4); + r[5 * i + 3] = (uint8_t) (t[1] >> 4); + r[5 * i + 4] = (uint8_t) (t[1] >> 12); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N / 2; ++i) { /* 5 input bytes carry two 20-bit values */
+        r->coeffs[2 * i + 0]  = a[5 * i + 0];
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16;
+        r->coeffs[2 * i + 0] &= 0xFFFFF;
+
+        r->coeffs[2 * i + 1]  = a[5 * i + 2] >> 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
+        r->coeffs[2 * i + 1] &= 0xFFFFF;
+
+        r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0];
+        r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
+    }
+
+    DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43].
+*              Input coefficients are assumed to be standard representatives.
+*
+* Arguments:   - uint8_t *r: pointer to output byte array with at least
+*                            POLYW1_PACKEDBYTES bytes
+*              - const poly *a: pointer to input polynomial
+**************************************************/
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a) {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N / 2; ++i) {
+        r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4));
+    }
+
+    DBENCH_STOP(*tpack);
+}
diff --git a/crypto_sign/dilithium3aes/clean/poly.h b/crypto_sign/dilithium3aes/clean/poly.h
new file mode 100644
index 00000000..092576ca
--- /dev/null
+++ b/crypto_sign/dilithium3aes/clean/poly.h
@@ -0,0 +1,53 @@
+#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_POLY_H
+#define PQCLEAN_DILITHIUM3AES_CLEAN_POLY_H
+#include "params.h"
+#include <stdint.h>
+
+typedef struct {
+    int32_t coeffs[N];
+} poly;
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_reduce(poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_caddq(poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_freeze(poly *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_shiftl(poly *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_invntt_tomont(poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a);
+unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h);
+
+int PQCLEAN_DILITHIUM3AES_CLEAN_poly_chknorm(const poly *a, int32_t B);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform(poly *a,
+        const uint8_t seed[SEEDBYTES],
+        uint16_t nonce);
+void
PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_eta(poly *a,
+        const uint8_t seed[SEEDBYTES],
+        uint16_t nonce);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_gamma1(poly *a,
+        const uint8_t seed[CRHBYTES],
+        uint16_t nonce);
+void PQCLEAN_DILITHIUM3AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyz_pack(uint8_t *r, const poly *a);
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a);
+
+void PQCLEAN_DILITHIUM3AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a);
+
+#endif
diff --git a/crypto_sign/dilithium3aes/clean/polyvec.c b/crypto_sign/dilithium3aes/clean/polyvec.c
new file mode 100644
index 00000000..a30d322d
--- /dev/null
+++ b/crypto_sign/dilithium3aes/clean/polyvec.c
@@ -0,0 +1,448 @@
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include <stdint.h>
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand
+*
+* Description: Implementation of ExpandA. Generates matrix A with uniformly
+*              random coefficients a_{i,j} by performing rejection
+*              sampling on the output stream of SHAKE128(rho|j|i)
+*              or AES256CTR(rho,j|i).
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for (i = 0; i < K; ++i) { + for (j = 0; j < L; ++j) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); + } + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + unsigned int i; + poly t; + + PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); + for (i = 1; i < L; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); + PQCLEAN_DILITHIUM3AES_CLEAN_poly_add(w, w, &t); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_reduce(). 
+* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM3AES_CLEAN_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(). +* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM3AES_CLEAN_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except if a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - polyveck *h: pointer to output vector +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM3AES_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM3AES_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium3aes/clean/polyvec.h b/crypto_sign/dilithium3aes/clean/polyvec.h new file mode 100644 index 00000000..668b284a --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/polyvec.h @@ -0,0 +1,68 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void 
PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + + +int PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const 
polyveck *v1); +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/reduce.c b/crypto_sign/dilithium3aes/clean/reduce.c new file mode 100644 index 00000000..551e06f6 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/reduce.c @@ -0,0 +1,69 @@ +#include "params.h" +#include "reduce.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; + + t = (int32_t)((uint64_t)a * (uint64_t)QINV); + t = (a - (int64_t)t * Q) >> 32; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t * Q; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_caddq +* +* Description: Add Q if input coefficient is negative. 
+* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_freeze(int32_t a) { + a = PQCLEAN_DILITHIUM3AES_CLEAN_reduce32(a); + a = PQCLEAN_DILITHIUM3AES_CLEAN_caddq(a); + return a; +} diff --git a/crypto_sign/dilithium3aes/clean/reduce.h b/crypto_sign/dilithium3aes/clean/reduce.h new file mode 100644 index 00000000..fa9cc09d --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/reduce.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_REDUCE_H +#include "params.h" +#include + +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_montgomery_reduce(int64_t a); + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_reduce32(int32_t a); + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_caddq(int32_t a); + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_freeze(int32_t a); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/rounding.c b/crypto_sign/dilithium3aes/clean/rounding.c new file mode 100644 index 00000000..6dd2665e --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/rounding.c @@ -0,0 +1,92 @@ +#include "params.h" +#include "rounding.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. 
+* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_power2round(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + (1 << (D - 1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_decompose(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + 127) >> 7; + a1 = (a1 * 1025 + (1 << 21)) >> 22; + a1 &= 15; + + *a0 = a - a1 * 2 * GAMMA2; + *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; + return a1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. +* +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element +* +* Returns 1 if overflow. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_make_hint(int32_t a0, int32_t a1) { + if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { + return 1; + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - int32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. 
+**************************************************/ +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM3AES_CLEAN_decompose(&a0, a); + if (hint == 0) { + return a1; + } + + if (a0 > 0) { + return (a1 + 1) & 15; + } + return (a1 - 1) & 15; +} diff --git a/crypto_sign/dilithium3aes/clean/rounding.h b/crypto_sign/dilithium3aes/clean/rounding.h new file mode 100644 index 00000000..af049b63 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/rounding.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_ROUNDING_H +#include "params.h" +#include + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM3AES_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM3AES_CLEAN_use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/sign.c b/crypto_sign/dilithium3aes/clean/sign.c new file mode 100644 index 00000000..3d96ee26 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/sign.c @@ -0,0 +1,343 @@ +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; + polyvecl mat[K]; + polyvecl s1, s1hat; + polyveck s2, t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Expand matrix */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand(mat, rho); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); + + /* Matrix-vector multiplication */ + s1hat = s1; + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(&s1hat); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(&t1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(&t1); + + /* Add error vector s2 */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_add(&t1, &t1, &s2); + + /* Extract t1 and write public key */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq(&t1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM3AES_CLEAN_pack_pk(pk, rho, &t1); + + /* Compute CRH(rho, t1) and write secret key */ + crh(tr, pk, PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES); + PQCLEAN_DILITHIUM3AES_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); + + return 0; +} + +/************************************************* +* Name: 
PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_signature +* +* Description: Computes signature. +* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + unsigned int n; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + polyvecl mat[K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); + + /* Matrix-vector multiplication */ + z = y; + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(&w1); + 
PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_decompose(&w1, &w0, &w1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM3AES_CLEAN_poly_challenge(&cp, sig); + PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(&cp); + + /* Compute z, reject if it reveals secret */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_invntt_tomont(&z); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_reduce(&z); + if (PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_sub(&w0, &w0, &h); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(&w0); + if (PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(&h); + if (PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_chknorm(&h, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_add(&w0, &w0, &h); + n = PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_make_hint(&h, &w0, &w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature */ + 
PQCLEAN_DILITHIUM3AES_CLEAN_pack_sig(sig, sig, &z, &h); + *siglen = PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_verify +* +* Description: Verifies signature. 
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; + polyvecl mat[K], z; + polyveck t1, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM3AES_CLEAN_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM3AES_CLEAN_unpack_sig(c, &z, &h, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + PQCLEAN_DILITHIUM3AES_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_expand(mat, rho); + + PQCLEAN_DILITHIUM3AES_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM3AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + PQCLEAN_DILITHIUM3AES_CLEAN_poly_ntt(&cp); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_ntt(&t1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); + + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_sub(&w1, &w1, &t1); + 
PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_invntt_tomont(&w1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM3AES_CLEAN_polyveck_pack_w1(buf, &w1); + + /* Call random oracle and verify challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM3AES_CLEAN_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification 
failed */ + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium3aes/clean/sign.h b/crypto_sign/dilithium3aes/clean/sign.h new file mode 100644 index 00000000..29008459 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM3AES_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM3AES_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3aes/clean/symmetric-aes.c b/crypto_sign/dilithium3aes/clean/symmetric-aes.c new file mode 100644 index 00000000..ac0e9c86 --- /dev/null +++ b/crypto_sign/dilithium3aes/clean/symmetric-aes.c @@ -0,0 +1,12 @@ +#include "aes256ctr.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM3AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce) { + uint8_t expnonce[12] = {0}; + expnonce[0] = (uint8_t) nonce; + expnonce[1] = (uint8_t) (nonce >> 8); + PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_init(state, key, expnonce); +} diff --git a/crypto_sign/dilithium3aes/clean/symmetric.h b/crypto_sign/dilithium3aes/clean/symmetric.h new file mode 100644 index 00000000..380b510e --- /dev/null +++ 
b/crypto_sign/dilithium3aes/clean/symmetric.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM3AES_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM3AES_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +void PQCLEAN_DILITHIUM3AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM3AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM3AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium4/META.yml b/crypto_sign/dilithium4/META.yml deleted file mode 100644 index 8e6378a2..00000000 --- a/crypto_sign/dilithium4/META.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Dilithium4 -type: signature -claimed-nist-level: 3 -length-public-key: 1760 -length-secret-key: 3856 -length-signature: 3366 -nistkat-sha256: 87844f967b4340d60dc4d83aac0f1d3a244fa8f9490017f72fd4969bba168f88 -testvectors-sha256: 91087880c84678bf66008d843e7fa1ab5231114a8ca9e9e36c41065f14172af2 -principal-submitters: - - Vadim Lyubashevsky -auxiliary-submitters: - - Léo Ducas - - Eike Kiltz - - Tancrède Lepoint - - Peter Schwabe - - Gregor Seiler - - Damien Stehlé -implementations: - - name: clean - version: 
https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 - - name: avx2 - version: https://github.com/pq-crystals/dilithium/commit/c1b40fd599e71f65aa18be64dd6c3fc8e84b0c08 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Darwin - - Linux - required_flags: - - avx2 - - bmi1 - - popcnt diff --git a/crypto_sign/dilithium4/avx2/LICENSE b/crypto_sign/dilithium4/avx2/LICENSE deleted file mode 100644 index 40541676..00000000 --- a/crypto_sign/dilithium4/avx2/LICENSE +++ /dev/null @@ -1,6 +0,0 @@ -Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) - -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. diff --git a/crypto_sign/dilithium4/avx2/Makefile b/crypto_sign/dilithium4/avx2/Makefile deleted file mode 100644 index e0df80cc..00000000 --- a/crypto_sign/dilithium4/avx2/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -# This Makefile can be used with GNU Make or BSD Make - -LIB=libdilithium4_avx2.a - -SOURCES = fips202x4.c invntt.S nttconsts.c ntt.S packing.c pointwise.S poly.c \ - polyvec.c reduce.S rejsample.c rounding.c sign.c stream.c -OBJECTS = fips202x4.o invntt.o nttconsts.o ntt.o packing.o pointwise.o poly.o \ - polyvec.o reduce.o rejsample.o rounding.o sign.o stream.o -HEADERS = alignment.h api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - nttconsts.h reduce.h rounding.h rejsample.h symmetric.h stream.h \ - fips202x4.h shuffle.inc cdecl.inc - -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror \ - -Wmissing-prototypes -Wredundant-decls -std=c99 \ - -Wcast-align -Werror=shadow\ - -mavx2 -mbmi -mpopcnt -I../../../common $(EXTRAFLAGS) - -all: $(LIB) - -KECCAK4XDIR=../../../common/keccak4x -KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o -KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) - -%.o: %.c $(HEADERS) - $(CC) $(CFLAGS) -c -o $@ $< - -%.o: %.S $(HEADERS) - $(CC) -c -o $@ $< - -$(LIB): 
$(OBJECTS) $(KECCAK4X) - $(AR) -r $@ $^ - -$(KECCAK4X): - $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) - -clean: - $(RM) $(OBJECTS) - $(RM) $(LIB) - $(MAKE) -C $(KECCAK4XDIR) clean - diff --git a/crypto_sign/dilithium4/avx2/alignment.h b/crypto_sign/dilithium4/avx2/alignment.h deleted file mode 100644 index fd6bbf6e..00000000 --- a/crypto_sign/dilithium4/avx2/alignment.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_ALIGNMENT_H -#define PQCLEAN_DILITHIUM4_AVX2_ALIGNMENT_H - -#define ALIGNED_UINT8(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/32]; \ - } - -#define ALIGNED_UINT32(N) \ - union { \ - uint32_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#define ALIGNED_UINT64(N) \ - union { \ - uint64_t as_arr[N]; \ - __m256i as_vec[(N)/8]; \ - } - -#endif //PQCLEAN_DILITHIUM4_AVX2_ALIGNMENT_H diff --git a/crypto_sign/dilithium4/avx2/api.h b/crypto_sign/dilithium4/avx2/api.h deleted file mode 100644 index b3d8059a..00000000 --- a/crypto_sign/dilithium4/avx2/api.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_API_H -#define PQCLEAN_DILITHIUM4_AVX2_API_H - -#include -#include - - -#define PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES 1760U -#define PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES 3856U -#define PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES 3366U - -#define PQCLEAN_DILITHIUM4_AVX2_CRYPTO_ALGNAME "Dilithium4" - - -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); - -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t 
*pk); - - - -#endif diff --git a/crypto_sign/dilithium4/avx2/fips202x4.c b/crypto_sign/dilithium4/avx2/fips202x4.c deleted file mode 100644 index 61bb8c94..00000000 --- a/crypto_sign/dilithium4/avx2/fips202x4.c +++ /dev/null @@ -1,233 +0,0 @@ -#include -#include - -#include "fips202.h" -#include "fips202x4.h" -#include "params.h" - -#define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset)))) - -static uint64_t load64(const uint8_t *x) { - uint64_t r = 0; - - for (size_t i = 0; i < 8; ++i) { - r |= (uint64_t)x[i] << 8 * i; - } - - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - for (size_t i = 0; i < 8; ++i) { - x[i] = (uint8_t)(u >> 8 * i); - } -} - -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, - uint8_t r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, - uint8_t p) { - size_t i; - uint8_t t0[200]; - uint8_t t1[200]; - uint8_t t2[200]; - uint8_t t3[200]; - uint64_t *ss = (uint64_t *)s; - - for (i = 0; i < 25; ++i) { - s[i] = _mm256_xor_si256(s[i], s[i]); - } - - while (mlen >= r) { - for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 * i); - } - - KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; - } - - for (i = 0; i < r; ++i) { - t0[i] = 0; - t1[i] = 0; - t2[i] = 0; - t3[i] = 0; - } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; - } - - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 
+ 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); - } -} - - -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - uint8_t r, - __m256i *s) { - uint64_t *ss = (uint64_t *)s; - - while (nblocks > 0) { - KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < r / 8; ++i) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); - } - - h0 += r; - h1 += r; - h2 += r; - h3 += r; - --nblocks; - } - -} - -void PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE128_RATE, m0, m1, m2, m3, mlen, 0x1F); -} - -void PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE128_RATE, s); -} - -void PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - keccak_absorb4x(s, SHAKE256_RATE, m0, m1, m2, m3, mlen, 0x1F); -} - -void PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s) { - keccak_squeezeblocks4x(h0, h1, h2, h3, nblocks, SHAKE256_RATE, s); -} - -void PQCLEAN_DILITHIUM4_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE128_RATE; - uint8_t t[4][SHAKE128_RATE]; - __m256i s[25]; - - PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); - - h0 += nblocks * 
SHAKE128_RATE; - h1 += nblocks * SHAKE128_RATE; - h2 += nblocks * SHAKE128_RATE; - h3 += nblocks * SHAKE128_RATE; - hlen -= nblocks * SHAKE128_RATE; - - if (hlen) { - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; - } - } -} - -void PQCLEAN_DILITHIUM4_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen) { - size_t nblocks = hlen / SHAKE256_RATE; - uint8_t t[4][SHAKE256_RATE]; - __m256i s[25]; - - PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x(s, m0, m1, m2, m3, mlen); - PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(h0, h1, h2, h3, nblocks, s); - - h0 += nblocks * SHAKE256_RATE; - h1 += nblocks * SHAKE256_RATE; - h2 += nblocks * SHAKE256_RATE; - h3 += nblocks * SHAKE256_RATE; - hlen -= nblocks * SHAKE256_RATE; - - if (hlen) { - PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(t[0], t[1], t[2], t[3], 1, s); - for (size_t i = 0; i < hlen; ++i) { - h0[i] = t[0][i]; - h1[i] = t[1][i]; - h2[i] = t[2][i]; - h3[i] = t[3][i]; - } - } -} diff --git a/crypto_sign/dilithium4/avx2/fips202x4.h b/crypto_sign/dilithium4/avx2/fips202x4.h deleted file mode 100644 index d1dd0e78..00000000 --- a/crypto_sign/dilithium4/avx2/fips202x4.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_FIPS202X4_H -#define PQCLEAN_DILITHIUM4_AVX2_FIPS202X4_H - -#include -#include -#include - -#include "params.h" - -void PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x( - __m256i *s, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); - -void PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); - -void PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x( - __m256i *s, - const uint8_t *m0, - 
const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); - -void PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t nblocks, - __m256i *s); - -void PQCLEAN_DILITHIUM4_AVX2_shake128_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); - -void PQCLEAN_DILITHIUM4_AVX2_shake256_4x( - uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, - size_t hlen, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen); - -#endif diff --git a/crypto_sign/dilithium4/avx2/invntt.S b/crypto_sign/dilithium4/avx2/invntt.S deleted file mode 100644 index 6c94513b..00000000 --- a/crypto_sign/dilithium4/avx2/invntt.S +++ /dev/null @@ -1,282 +0,0 @@ -.include "shuffle.inc" -#include "cdecl.inc" - -.macro butterfly l0,l1,l2,l3,h0,h1,h2,h3,z0=15,z1=3 -vpaddd %ymm2,%ymm\l0,%ymm12 -vpaddd %ymm2,%ymm\l1,%ymm13 -vpaddd %ymm2,%ymm\l2,%ymm14 - -vpsubd %ymm\h0,%ymm12,%ymm12 -vpsubd %ymm\h1,%ymm13,%ymm13 -vpsubd %ymm\h2,%ymm14,%ymm14 - -vpmuludq %ymm\z0,%ymm12,%ymm12 -vpmuludq %ymm\z0,%ymm13,%ymm13 -vpaddd %ymm2,%ymm\l3,%ymm15 - -vpmuludq %ymm\z1,%ymm14,%ymm14 -vpsubd %ymm\h3,%ymm15,%ymm15 -vpaddd %ymm\l0,%ymm\h0,%ymm\l0 - -vpmuludq %ymm\z1,%ymm15,%ymm15 -vpaddd %ymm\l1,%ymm\h1,%ymm\l1 -vpaddd %ymm\l2,%ymm\h2,%ymm\l2 - -vpaddd %ymm\l3,%ymm\h3,%ymm\l3 - -vpmuludq %ymm0,%ymm12,%ymm\h0 -vpmuludq %ymm0,%ymm13,%ymm\h1 -vpmuludq %ymm0,%ymm14,%ymm\h2 -vpmuludq %ymm0,%ymm15,%ymm\h3 -vpmuludq %ymm1,%ymm\h0,%ymm\h0 -vpmuludq %ymm1,%ymm\h1,%ymm\h1 -vpmuludq %ymm1,%ymm\h2,%ymm\h2 -vpmuludq %ymm1,%ymm\h3,%ymm\h3 -vpaddq %ymm12,%ymm\h0,%ymm\h0 -vpaddq %ymm13,%ymm\h1,%ymm\h1 -vpaddq %ymm14,%ymm\h2,%ymm\h2 -vpaddq %ymm15,%ymm\h3,%ymm\h3 -vpsrlq $32,%ymm\h0,%ymm\h0 -vpsrlq $32,%ymm\h1,%ymm\h1 -vpsrlq $32,%ymm\h2,%ymm\h2 -vpsrlq $32,%ymm\h3,%ymm\h3 -.endm - 
-.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm6 -vmovdqa 32(%rsi),%ymm7 -vmovdqa 64(%rsi),%ymm5 -vmovdqa 96(%rsi),%ymm10 - -#reorder -shuffle8 6,5,8,5 -shuffle8 7,10,6,10 - -shuffle4 8,6,4,6 -shuffle4 5,10,8,10 - -vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 -vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 - -level0: -vpmovzxdq (%rdx),%ymm3 -vpmovzxdq 16(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 - -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 - -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 - -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpmovzxdq 32(%rdx),%ymm5 -vpmovzxdq 48(%rdx),%ymm7 - -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 - -vpaddd %ymm10,%ymm11,%ymm10 - -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 - -level1: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) -vpmovzxdq 64(%rdx),%ymm15 -vpmovzxdq 80(%rdx),%ymm3 - -butterfly 4,5,8,9,6,7,10,11 - -level2: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) -vpmovzxdq 96(%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11,3,3 - -#shuffle -shuffle4 4,5,3,5 -shuffle4 6,7,4,7 -shuffle4 8,9,6,9 -shuffle4 10,11,8,11 - -level3: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) 
-vpbroadcastd 112(%rdx),%ymm14 -vpbroadcastd 116(%rdx),%ymm15 -vpblendd $0xF0,%ymm15,%ymm14,%ymm10 - -butterfly 3,4,6,8,5,7,9,11,10,10 - -#shuffle -shuffle8 3,4,10,4 -shuffle8 6,8,3,8 -shuffle8 5,7,6,7 -shuffle8 9,11,5,11 - -level4: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) -vpbroadcastd 120(%rdx),%ymm9 - -butterfly 10,3,6,5,4,8,7,11,9,9 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x256q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 256(%rsi),%ymm5 -vmovdqa 512(%rsi),%ymm6 -vmovdqa 768(%rsi),%ymm7 -vmovdqa 1024(%rsi),%ymm8 -vmovdqa 1280(%rsi),%ymm9 -vmovdqa 1536(%rsi),%ymm10 -vmovdqa 1792(%rsi),%ymm11 - -level5: -vpbroadcastd (%rdx),%ymm3 -vpbroadcastd 4(%rdx),%ymm15 -vpaddd %ymm2,%ymm4,%ymm12 -vpaddd %ymm2,%ymm6,%ymm13 -vpaddd %ymm2,%ymm8,%ymm14 - -vpsubd %ymm5,%ymm12,%ymm12 -vpsubd %ymm7,%ymm13,%ymm13 -vpsubd %ymm9,%ymm14,%ymm14 - -vpmuludq %ymm3,%ymm12,%ymm12 -vpmuludq %ymm15,%ymm13,%ymm13 -vpaddd %ymm2,%ymm10,%ymm15 - -vpsubd %ymm11,%ymm15,%ymm15 -vpaddd %ymm4,%ymm5,%ymm4 -vpaddd %ymm6,%ymm7,%ymm6 -vpbroadcastd 8(%rdx),%ymm5 -vpbroadcastd 12(%rdx),%ymm7 - -vpmuludq %ymm5,%ymm14,%ymm14 -vpmuludq %ymm7,%ymm15,%ymm15 -vpaddd %ymm8,%ymm9,%ymm8 - -vpaddd %ymm10,%ymm11,%ymm10 - -vpmuludq %ymm0,%ymm12,%ymm5 -vpmuludq %ymm0,%ymm13,%ymm7 -vpmuludq %ymm0,%ymm14,%ymm9 -vpmuludq %ymm0,%ymm15,%ymm11 -vpmuludq %ymm1,%ymm5,%ymm5 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm9,%ymm9 -vpmuludq %ymm1,%ymm11,%ymm11 -vpaddq %ymm12,%ymm5,%ymm5 -vpaddq %ymm13,%ymm7,%ymm7 -vpaddq %ymm14,%ymm9,%ymm9 -vpaddq %ymm15,%ymm11,%ymm11 -vpsrlq 
$32,%ymm5,%ymm5 -vpsrlq $32,%ymm7,%ymm7 -vpsrlq $32,%ymm9,%ymm9 -vpsrlq $32,%ymm11,%ymm11 - -level6: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) -vpbroadcastd 16(%rdx),%ymm15 -vpbroadcastd 20(%rdx),%ymm3 - -butterfly 4,5,8,9,6,7,10,11 - -level7: -#cdecl(PQCLEAN_DILITHIUM4_AVX2_zetas) -vpbroadcastd 24(%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11,3,3 - -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xdiv)(%rip),%ymm3 - -vpmuludq %ymm3,%ymm4,%ymm4 -vpmuludq %ymm3,%ymm5,%ymm5 -vpmuludq %ymm3,%ymm6,%ymm6 -vpmuludq %ymm3,%ymm7,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm12,%ymm4,%ymm4 -vpaddq %ymm13,%ymm5,%ymm5 -vpaddq %ymm14,%ymm6,%ymm6 -vpaddq %ymm15,%ymm7,%ymm7 -vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm5,%ymm5 -vpsrlq $32,%ymm6,%ymm6 -vpsrlq $32,%ymm7,%ymm7 - -#store -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_mask)(%rip),%ymm3 -vpermd %ymm4,%ymm3,%ymm4 -vpermd %ymm5,%ymm3,%ymm5 -vpermd %ymm6,%ymm3,%ymm6 -vpermd %ymm7,%ymm3,%ymm7 -vpermd %ymm8,%ymm3,%ymm8 -vpermd %ymm9,%ymm3,%ymm9 -vpermd %ymm10,%ymm3,%ymm10 -vpermd %ymm11,%ymm3,%ymm11 -vmovdqa %xmm4,(%rdi) -vmovdqa %xmm5,128(%rdi) -vmovdqa %xmm6,256(%rdi) -vmovdqa %xmm7,384(%rdi) -vmovdqa %xmm8,512(%rdi) -vmovdqa %xmm9,640(%rdi) -vmovdqa %xmm10,768(%rdi) -vmovdqa %xmm11,896(%rdi) - -ret diff --git a/crypto_sign/dilithium4/avx2/ntt.S b/crypto_sign/dilithium4/avx2/ntt.S deleted file mode 100644 index b3f499bc..00000000 --- a/crypto_sign/dilithium4/avx2/ntt.S +++ /dev/null @@ -1,179 +0,0 @@ -.include "shuffle.inc" -#include "cdecl.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,z0=3,z1=3,z2=3,z3=3 -#mul -vpmuludq %ymm\z0,%ymm\rh0,%ymm\rh0 -vpmuludq %ymm\z1,%ymm\rh1,%ymm\rh1 -vpmuludq %ymm\z2,%ymm\rh2,%ymm\rh2 -vpmuludq %ymm\z3,%ymm\rh3,%ymm\rh3 - -#reduce -vpmuludq %ymm0,%ymm\rh0,%ymm12 -vpmuludq %ymm0,%ymm\rh1,%ymm13 
-vpmuludq %ymm0,%ymm\rh2,%ymm14 -vpmuludq %ymm0,%ymm\rh3,%ymm15 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm\rh0,%ymm12,%ymm12 -vpaddq %ymm\rh1,%ymm13,%ymm13 -vpaddq %ymm\rh2,%ymm14,%ymm14 -vpaddq %ymm\rh3,%ymm15,%ymm15 -vpsrlq $32,%ymm12,%ymm12 -vpsrlq $32,%ymm13,%ymm13 -vpsrlq $32,%ymm14,%ymm14 -vpsrlq $32,%ymm15,%ymm15 - -#update -vpaddd %ymm2,%ymm\rl0,%ymm\rh0 -vpaddd %ymm2,%ymm\rl1,%ymm\rh1 -vpaddd %ymm2,%ymm\rl2,%ymm\rh2 -vpaddd %ymm2,%ymm\rl3,%ymm\rh3 -vpaddd %ymm12,%ymm\rl0,%ymm\rl0 -vpaddd %ymm13,%ymm\rl1,%ymm\rl1 -vpaddd %ymm14,%ymm\rl2,%ymm\rl2 -vpaddd %ymm15,%ymm\rl3,%ymm\rl3 -vpsubd %ymm12,%ymm\rh0,%ymm\rh0 -vpsubd %ymm13,%ymm\rh1,%ymm\rh1 -vpsubd %ymm14,%ymm\rh2,%ymm\rh2 -vpsubd %ymm15,%ymm\rh3,%ymm\rh3 -.endm - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 - -level0: -#zetas -vpbroadcastd (%rdx),%ymm3 - -#load -vpmovzxdq (%rsi),%ymm4 -vpmovzxdq 128(%rsi),%ymm5 -vpmovzxdq 256(%rsi),%ymm6 -vpmovzxdq 384(%rsi),%ymm7 -vpmovzxdq 512(%rsi),%ymm8 -vpmovzxdq 640(%rsi),%ymm9 -vpmovzxdq 768(%rsi),%ymm10 -vpmovzxdq 896(%rsi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -level1: -#PQCLEAN_DILITHIUM4_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 - -butterfly 4,5,8,9,6,7,10,11,12,12,13,13 - -level2: -#PQCLEAN_DILITHIUM4_AVX2_zetas -vpbroadcastd 12(%rdx),%ymm12 -vpbroadcastd 16(%rdx),%ymm13 -vpbroadcastd 20(%rdx),%ymm14 -vpbroadcastd 24(%rdx),%ymm15 - -butterfly 4,6,8,10,5,7,9,11,12,13,14,15 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,256(%rdi) -vmovdqa %ymm6,512(%rdi) -vmovdqa %ymm7,768(%rdi) -vmovdqa %ymm8,1024(%rdi) -vmovdqa %ymm9,1280(%rdi) -vmovdqa %ymm10,1536(%rdi) -vmovdqa %ymm11,1792(%rdi) - -ret - -.global 
cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x2q)(%rip),%ymm2 - -#load -vmovdqa (%rsi),%ymm4 -vmovdqa 32(%rsi),%ymm5 -vmovdqa 64(%rsi),%ymm6 -vmovdqa 96(%rsi),%ymm7 -vmovdqa 128(%rsi),%ymm8 -vmovdqa 160(%rsi),%ymm9 -vmovdqa 192(%rsi),%ymm10 -vmovdqa 224(%rsi),%ymm11 - -level3: -#zetas -vpbroadcastd (%rdx),%ymm3 - -butterfly 4,5,6,7,8,9,10,11 - -level4: -#PQCLEAN_DILITHIUM4_AVX2_zetas -vpbroadcastd 4(%rdx),%ymm12 -vpbroadcastd 8(%rdx),%ymm13 -vpblendd $0xF0,%ymm13,%ymm12,%ymm12 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly 3,8,4,9,5,10,6,11,12,12,12,12 - -level5: -#zetas -vpmovzxdq 12(%rdx),%ymm12 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly 7,5,3,10,8,6,4,11,12,12,12,12 - -level6: -#zetas -vpmovzxdq 28(%rdx),%ymm12 -vpmovzxdq 44(%rdx),%ymm13 - -butterfly 7,5,8,6,3,10,4,11,12,12,13,13 - -level7: -#zetas -vpmovzxdq 60(%rdx),%ymm12 -vpmovzxdq 76(%rdx),%ymm13 -vpmovzxdq 92(%rdx),%ymm14 -vpmovzxdq 108(%rdx),%ymm15 - -butterfly 7,3,8,4,5,10,6,11,12,13,14,15 - -#store -vpsllq $32,%ymm5,%ymm5 -vpsllq $32,%ymm10,%ymm10 -vpsllq $32,%ymm6,%ymm6 -vpsllq $32,%ymm11,%ymm11 -vpblendd $0xAA,%ymm5,%ymm7,%ymm7 -vpblendd $0xAA,%ymm10,%ymm3,%ymm3 -vpblendd $0xAA,%ymm6,%ymm8,%ymm8 -vpblendd $0xAA,%ymm11,%ymm4,%ymm4 - -shuffle4 7,3,5,3 -shuffle4 8,4,7,4 - -shuffle8 5,7,6,7 -shuffle8 3,4,5,4 - -vmovdqa %ymm6,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm7,64(%rdi) -vmovdqa %ymm4,96(%rdi) - -ret diff --git a/crypto_sign/dilithium4/avx2/ntt.h b/crypto_sign/dilithium4/avx2/ntt.h deleted file mode 100644 index e337cda9..00000000 --- a/crypto_sign/dilithium4/avx2/ntt.h +++ /dev/null @@ -1,36 +0,0 @@ -#ifndef NTT_H -#define NTT_H - -#include - -#include "nttconsts.h" -#include "params.h" - 
-void PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas -); -void PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas -); - -void PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx( - uint64_t *tmp, - const uint32_t *a, - const uint32_t *zetas_inv -); -void PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx( - uint32_t *a, - const uint64_t *tmp, - const uint32_t *zetas_inv -); - -void PQCLEAN_DILITHIUM4_AVX2_pointwise_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); -void PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx( - uint32_t *c, const uint32_t *a, const uint32_t *b); - -#endif diff --git a/crypto_sign/dilithium4/avx2/nttconsts.c b/crypto_sign/dilithium4/avx2/nttconsts.c deleted file mode 100644 index 20aa2120..00000000 --- a/crypto_sign/dilithium4/avx2/nttconsts.c +++ /dev/null @@ -1,80 +0,0 @@ -#include "nttconsts.h" - -#define QINV 4236238847 // -q^(-1) mod 2^32 -#define MONT 4193792ULL -#define DIV (((MONT*MONT % Q) * (Q-1) % Q) * ((Q-1) >> 8) % Q) - - -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q = {.as_arr = {2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q, 2 * Q}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q = {.as_arr = {256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, 256 * Q, - 256 * Q - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask = {.as_arr = {0, 2, 4, 6, 0, 0, 0, 0}}; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones = {.as_arr = {0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, 0x7FFFFF, - 0x7FFFFF, 0x7FFFFF - } - }; -const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv = { .as_arr = {DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV}}; - -#undef QINV -#undef MONT -#undef DIV - - -const 
aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas = { - .as_arr = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, 2725464, 1024112, 2706023, 95776, - 3077325, 3530437, 4450022, 4702672, 6927966, 2176455, 6851714, 5339162, 3475950, 6795196, 2091667, - 5037939, 266997, 4860065, 3407706, 2244091, 2434439, 4621053, 2316500, 5933984, 7144689, 7183191, - 3817976, 4817955, 3513181, 5187039, 2353451, 7300517, 3585928, 6718724, 4788269, 5842901, 3915439, - 7122806, 4296819, 5190273, 4747489, 1939314, 7380215, 5223087, 126922, 900702, 495491, 7725090, 4823422, - 1859098, 6767243, 5257975, 7855319, 909542, 8337157, 2031748, 7611795, 819034, 7857917, 3207046, 4784579, - 8021166, 7830929, 7260833, 4519302, 5336701, 3574422, 5512770, 3412210, 2147896, 5412772, 7969390, - 7396998, 2715295, 4686924, 5903370, 342297, 3437287, 2842341, 4055324, 286988, 5038140, 2691481, 1247620, - 5942594, 1735879, 5790267, 2486353, 4108315, 203044, 1265009, 1595974, 6288512, 2619752, 6271868, - 3539968, 8079950, 2348700, 7841118, 7709315, 8357436, 7998430, 1852771, 7151892, 7072248, 1349076, - 6949987, 4613401, 5386378, 7047359, 7929317, 1250494, 1869119, 1237275, 1312455, 2635921, 1903435, - 5062207, 3306115, 4832145, 7329447, 6950192, 6417775, 3119733, 6262231, 4520680, 6681150, 6736599, - 3505694, 4558682, 5037034, 508951, 44288, 904516, 264944, 3097992, 7280319, 3958618, 7100756, 1500165, - 7838005, 5796124, 1917081, 777191, 5548557, 4656147, 5834105, 2235880, 6709241, 594136, 7005614, 3406031, - 6533464, 4603424, 5495562, 6980856, 5102745, 3507263, 6239768, 6779997, 3699596, 4656075, 1653064, - 2389356, 759969, 8371839, 5130689, 8169440, 7063561, 6366809, 1957272, 5196991, 810149, 2432395, 3369112, - 162844, 1652634, 2454455, 185531, 1616392, 4686184, 8215696, 7173032, 3014001, 6581310, 3111497, 1757237, - 8360995, 811944, 531354, 954230, 3881043, 189548, 3159746, 5971092, 1315589, 4827145, 6529015, 8202977, - 1341330, 5341501, 2213111, 7953734, 6712985, 3523897, 
7404533, 1723600, 7276084, 3866901, 1717735, - 6577327, 8119771, 269760, 472078, 1910376, 4546524, 2680103, 4010497, 280005, 3900724, 5823537, 2071892, - 5582638, 1285669, 7567685, 5361315, 4751448, 6795489, 6940675, 4499357, 3839961, 5441381, 183443, - 7826001, 3937738, 6144432, 7403526, 3919660, 1400424, 7959518, 1612842, 8332111, 7534263, 6094090, - 4834730, 7018208, 1976782 - } -}; - -const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv = { - .as_arr = { - 6403635, 1362209, 3545687, 2286327, 846154, 48306, 6767575, 420899, 6979993, 4460757, 976891, 2235985, - 4442679, 554416, 8196974, 2939036, 4540456, 3881060, 1439742, 1584928, 3628969, 3019102, 812732, 7094748, - 2797779, 6308525, 2556880, 4479693, 8100412, 4369920, 5700314, 3833893, 6470041, 7908339, 8110657, 260646, - 1803090, 6662682, 4513516, 1104333, 6656817, 975884, 4856520, 1667432, 426683, 6167306, 3038916, 7039087, - 177440, 1851402, 3553272, 7064828, 2409325, 5220671, 8190869, 4499374, 7426187, 7849063, 7568473, 19422, - 6623180, 5268920, 1799107, 5366416, 1207385, 164721, 3694233, 6764025, 8194886, 5925962, 6727783, 8217573, - 5011305, 5948022, 7570268, 3183426, 6423145, 2013608, 1316856, 210977, 3249728, 8578, 7620448, 5991061, - 6727353, 3724342, 4680821, 1600420, 2140649, 4873154, 3277672, 1399561, 2884855, 3776993, 1846953, 4974386, - 1374803, 7786281, 1671176, 6144537, 2546312, 3724270, 2831860, 7603226, 6463336, 2584293, 542412, 6880252, - 1279661, 4421799, 1100098, 5282425, 8115473, 7475901, 8336129, 7871466, 3343383, 3821735, 4874723, 1643818, - 1699267, 3859737, 2118186, 5260684, 1962642, 1430225, 1050970, 3548272, 5074302, 3318210, 6476982, 5744496, - 7067962, 7143142, 6511298, 7129923, 451100, 1333058, 2994039, 3767016, 1430430, 7031341, 1308169, 1228525, - 6527646, 381987, 22981, 671102, 539299, 6031717, 300467, 4840449, 2108549, 5760665, 2091905, 6784443, - 7115408, 8177373, 4272102, 5894064, 2590150, 6644538, 2437823, 7132797, 5688936, 3342277, 8093429, 4325093, - 5538076, 
4943130, 8038120, 2477047, 3693493, 5665122, 983419, 411027, 2967645, 6232521, 4968207, 2867647, - 4805995, 3043716, 3861115, 1119584, 549488, 359251, 3595838, 5173371, 522500, 7561383, 768622, 6348669, - 43260, 7470875, 525098, 3122442, 1613174, 6521319, 3556995, 655327, 7884926, 7479715, 8253495, 3157330, - 1000202, 6441103, 3632928, 3190144, 4083598, 1257611, 4464978, 2537516, 3592148, 1661693, 4794489, 1079900, - 6026966, 3193378, 4867236, 3562462, 4562441, 1197226, 1235728, 2446433, 6063917, 3759364, 5945978, 6136326, - 4972711, 3520352, 8113420, 3342478, 6288750, 1585221, 4904467, 3041255, 1528703, 6203962, 1452451, 3677745, - 3930395, 4849980, 5303092, 8284641, 5674394, 7356305, 5654953, 6554070, 7913949, 876248, 777960, 8143293, - 518909, 2608894, 3975713 - } -}; diff --git a/crypto_sign/dilithium4/avx2/nttconsts.h b/crypto_sign/dilithium4/avx2/nttconsts.h deleted file mode 100644 index 5a340bbf..00000000 --- a/crypto_sign/dilithium4/avx2/nttconsts.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H -#define PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H - -#include -#include - -#include "alignment.h" -#include "params.h" - -typedef ALIGNED_UINT32(8) aligned_uint32x8_t; - -typedef ALIGNED_UINT32(N) aligned_uint32xN_t; - - -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xqinv; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xq; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x2q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x256q; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_mask; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8x23ones; -extern const aligned_uint32x8_t PQCLEAN_DILITHIUM4_AVX2_8xdiv; - -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas; -extern const aligned_uint32xN_t PQCLEAN_DILITHIUM4_AVX2_zetas_inv; - -#endif //PQCLEAN_DILITHIUM4_AVX2_NTTCONSTS_H - diff --git a/crypto_sign/dilithium4/avx2/packing.c b/crypto_sign/dilithium4/avx2/packing.c deleted 
file mode 100644 index 0e73aa8c..00000000 --- a/crypto_sign/dilithium4/avx2/packing.c +++ /dev/null @@ -1,297 +0,0 @@ -#include "packing.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_pack_pk -* -* Description: Bit-pack public key pk = (rho, t1). -* -* Arguments: - uint8_t pk[]: output byte array -* - const uint8_t rho[]: byte array containing rho -* - const polyveck *t1: pointer to vector t1 -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - pk[i] = rho[i]; - } - pk += SEEDBYTES; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_pk -* -* Description: Unpack public key pk = (rho, t1). -* -* Arguments: - const uint8_t rho[]: output byte array for rho -* - const polyveck *t1: pointer to output vector t1 -* - uint8_t pk[]: byte array containing bit-packed pk -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - rho[i] = pk[i]; - } - pk += SEEDBYTES; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_pack_sk -* -* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
-* -* Arguments: - uint8_t sk[]: output byte array -* - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key -* - const uint8_t tr[]: byte array containing tr -* - const polyvecl *s1: pointer to vector s1 -* - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - sk[i] = rho[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - sk[i] = key[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < CRHBYTES; ++i) { - sk[i] = tr[i]; - } - sk += CRHBYTES; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); - } - sk += L * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); - } - sk += K * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_sk -* -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
-* -* Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key -* - const uint8_t tr[]: output byte array for tr -* - const polyvecl *s1: pointer to output vector s1 -* - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 -* - uint8_t sk[]: byte array containing bit-packed sk -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - rho[i] = sk[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - key[i] = sk[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < CRHBYTES; ++i) { - tr[i] = sk[i]; - } - sk += CRHBYTES; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); - } - sk += L * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); - } - sk += K * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_pack_sig -* -* Description: Bit-pack signature sig = (z, h, c). 
-* -* Arguments: - uint8_t sig[]: output byte array -* - const polyvecl *z: pointer to vector z -* - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); - } - sig += L * POLZ_SIZE_PACKED; - - /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { - sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; - } - } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_unpack_sig -* -* Description: Unpack signature sig = (z, h, c). -* -* Arguments: - polyvecl *z: pointer to output vector z -* - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial -* - const uint8_t sig[]: byte array containing -* bit-packed signature -* -* Returns 1 in case of malformed signature; otherwise 0. 
-**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); - } - sig += L * POLZ_SIZE_PACKED; - - /* Decode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - h->vec[i].coeffs[j] = 0; - } - - if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { - return 1; - } - - for (size_t j = k; j < sig[OMEGA + i]; ++j) { - /* Coefficients are ordered for strong unforgeability */ - if (j > k && sig[j] <= sig[j - 1]) { - return 1; - } - h->vec[i].coeffs[sig[j]] = 1; - } - - k = sig[OMEGA + i]; - } - - /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { - if (sig[j]) { - return 1; - } - } - - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - - return 0; -} diff --git a/crypto_sign/dilithium4/avx2/packing.h b/crypto_sign/dilithium4/avx2/packing.h deleted file mode 100644 index 979ec810..00000000 --- a/crypto_sign/dilithium4/avx2/packing.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_PACKING_H -#define PQCLEAN_DILITHIUM4_AVX2_PACKING_H - -#include "api.h" -#include "params.h" -#include "polyvec.h" - -void PQCLEAN_DILITHIUM4_AVX2_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck 
*t1); -void PQCLEAN_DILITHIUM4_AVX2_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM4_AVX2_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); - -void PQCLEAN_DILITHIUM4_AVX2_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM4_AVX2_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM4_AVX2_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES]); - -#endif diff --git a/crypto_sign/dilithium4/avx2/params.h b/crypto_sign/dilithium4/avx2/params.h deleted file mode 100644 index 54acba57..00000000 --- a/crypto_sign/dilithium4/avx2/params.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_PARAMS_H -#define PQCLEAN_DILITHIUM4_AVX2_PARAMS_H - - -#define SEEDBYTES 32 -#define CRHBYTES 48 -#define N 256 -#define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) - -#define K 6 -#define L 5 -#define ETA 3 -#define SETABITS 3 -#define BETA 175 -#define OMEGA 120 - - -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) - -#endif diff --git a/crypto_sign/dilithium4/avx2/pointwise.S b/crypto_sign/dilithium4/avx2/pointwise.S deleted file mode 100644 index a9d3ddd3..00000000 --- a/crypto_sign/dilithium4/avx2/pointwise.S +++ /dev/null @@ -1,194 +0,0 @@ -#include "params.h" -#include 
"cdecl.inc" - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 - -xor %eax,%eax -_looptop1: -#load -vmovdqa (%rsi),%ymm2 -vmovdqa 32(%rsi),%ymm4 -vmovdqa 64(%rsi),%ymm6 -vmovdqa (%rdx),%ymm10 -vmovdqa 32(%rdx),%ymm12 -vmovdqa 64(%rdx),%ymm14 -vpsrlq $32,%ymm2,%ymm3 -vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm6,%ymm7 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 -vpsrlq $32,%ymm14,%ymm15 - -#mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 -vpmuludq %ymm6,%ymm14,%ymm6 -vpmuludq %ymm7,%ymm15,%ymm7 - -#reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq %ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm0,%ymm6,%ymm14 -vpmuludq %ymm0,%ymm7,%ymm15 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpmuludq %ymm1,%ymm14,%ymm14 -vpmuludq %ymm1,%ymm15,%ymm15 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 -vpaddq %ymm6,%ymm14,%ymm6 -vpaddq %ymm7,%ymm15,%ymm7 -vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 -vpsrlq $32,%ymm6,%ymm6 - -#store -vpblendd $0xAA,%ymm3,%ymm2,%ymm2 -vpblendd $0xAA,%ymm5,%ymm4,%ymm4 -vpblendd $0xAA,%ymm7,%ymm6,%ymm6 -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm4,32(%rdi) -vmovdqa %ymm6,64(%rdi) - -add $96,%rdi -add $96,%rsi -add $96,%rdx -add $1,%eax -cmp $10,%eax -jb _looptop1 - -vmovdqa (%rsi),%ymm2 -vmovdqa 32(%rsi),%ymm4 -vmovdqa (%rdx),%ymm10 -vmovdqa 32(%rdx),%ymm12 -vpsrlq $32,%ymm2,%ymm3 -vpsrlq $32,%ymm4,%ymm5 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 - -#mul -vpmuludq %ymm2,%ymm10,%ymm2 -vpmuludq %ymm3,%ymm11,%ymm3 -vpmuludq %ymm4,%ymm12,%ymm4 -vpmuludq %ymm5,%ymm13,%ymm5 - -#reduce -vpmuludq %ymm0,%ymm2,%ymm10 -vpmuludq 
%ymm0,%ymm3,%ymm11 -vpmuludq %ymm0,%ymm4,%ymm12 -vpmuludq %ymm0,%ymm5,%ymm13 -vpmuludq %ymm1,%ymm10,%ymm10 -vpmuludq %ymm1,%ymm11,%ymm11 -vpmuludq %ymm1,%ymm12,%ymm12 -vpmuludq %ymm1,%ymm13,%ymm13 -vpaddq %ymm2,%ymm10,%ymm2 -vpaddq %ymm3,%ymm11,%ymm3 -vpaddq %ymm4,%ymm12,%ymm4 -vpaddq %ymm5,%ymm13,%ymm5 -vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 - -#store -vpblendd $0x55,%ymm2,%ymm3,%ymm2 -vpblendd $0x55,%ymm4,%ymm5,%ymm4 -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm4,32(%rdi) - -ret - -.macro pointwise off -#load -vmovdqa \off(%rsi),%ymm6 -vmovdqa \off+32(%rsi),%ymm8 -vmovdqa \off(%rdx),%ymm10 -vmovdqa \off+32(%rdx),%ymm12 -vpsrlq $32,%ymm6,%ymm7 -vpsrlq $32,%ymm8,%ymm9 -vpsrlq $32,%ymm10,%ymm11 -vpsrlq $32,%ymm12,%ymm13 - -#mul -vpmuludq %ymm6,%ymm10,%ymm6 -vpmuludq %ymm7,%ymm11,%ymm7 -vpmuludq %ymm8,%ymm12,%ymm8 -vpmuludq %ymm9,%ymm13,%ymm9 -.endm - -.macro acc -vpaddq %ymm6,%ymm2,%ymm2 -vpaddq %ymm7,%ymm3,%ymm3 -vpaddq %ymm8,%ymm4,%ymm4 -vpaddq %ymm9,%ymm5,%ymm5 -.endm - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xqinv)(%rip),%ymm0 -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm1 - -xor %eax,%eax -_looptop2: -pointwise 0 - -#mov -vmovdqa %ymm6,%ymm2 -vmovdqa %ymm7,%ymm3 -vmovdqa %ymm8,%ymm4 -vmovdqa %ymm9,%ymm5 - -pointwise 1024 -acc - -pointwise 2048 -acc - -pointwise 3072 -acc - -pointwise 4096 -acc - -#reduce -vpmuludq %ymm0,%ymm2,%ymm6 -vpmuludq %ymm0,%ymm3,%ymm7 -vpmuludq %ymm0,%ymm4,%ymm8 -vpmuludq %ymm0,%ymm5,%ymm9 -vpmuludq %ymm1,%ymm6,%ymm6 -vpmuludq %ymm1,%ymm7,%ymm7 -vpmuludq %ymm1,%ymm8,%ymm8 -vpmuludq %ymm1,%ymm9,%ymm9 -vpaddq %ymm2,%ymm6,%ymm2 -vpaddq %ymm3,%ymm7,%ymm3 -vpaddq %ymm4,%ymm8,%ymm4 -vpaddq %ymm5,%ymm9,%ymm5 -vpsrlq $32,%ymm2,%ymm2 -vpsrlq $32,%ymm4,%ymm4 - -#store -vpblendd $0xAA,%ymm3,%ymm2,%ymm2 -vpblendd $0xAA,%ymm5,%ymm4,%ymm4 - -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm4,32(%rdi) - -add $64,%rsi -add $64,%rdx -add 
$64,%rdi -add $1,%eax -cmp $16,%eax -jb _looptop2 - -ret diff --git a/crypto_sign/dilithium4/avx2/poly.c b/crypto_sign/dilithium4/avx2/poly.c deleted file mode 100644 index f5d28f56..00000000 --- a/crypto_sign/dilithium4/avx2/poly.c +++ /dev/null @@ -1,923 +0,0 @@ -#include -#include - -#include "fips202x4.h" -#include "ntt.h" -#include "nttconsts.h" -#include "params.h" -#include "poly.h" -#include "reduce.h" -#include "rejsample.h" -#include "rounding.h" -#include "symmetric.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_reduce -* -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_reduce(poly *a) { - PQCLEAN_DILITHIUM4_AVX2_reduce_avx(a->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_csubq -* -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_csubq(poly *a) { - PQCLEAN_DILITHIUM4_AVX2_csubq_avx(a->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_freeze -* -* Description: Reduce all coefficients of the polynomial to standard -* representatives. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_freeze(poly *a) { - PQCLEAN_DILITHIUM4_AVX2_reduce_avx(a->coeffs); - PQCLEAN_DILITHIUM4_AVX2_csubq_avx(a->coeffs); - -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_add -* -* Description: Add polynomials. No modular reduction is performed. 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first summand -* - const poly *b: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_sub -* -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is -* performed. -* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first input polynomial -* - const poly *b: pointer to second input polynomial to be -* subtraced from first input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { - __m256i vec0, vec1; - const __m256i twoq = _mm256_load_si256(PQCLEAN_DILITHIUM4_AVX2_8x2q.as_vec); - - for (size_t i = 0; i < N / 8; i++) { - vec0 = _mm256_load_si256(&a->coeffs_x8[i]); - vec1 = _mm256_load_si256(&b->coeffs_x8[i]); - vec0 = _mm256_add_epi32(vec0, twoq); - vec0 = _mm256_sub_epi32(vec0, vec1); - _mm256_store_si256(&c->coeffs_x8[i], vec0); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_shiftl -* -* Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. 
-* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(poly *a) { - __m256i vec; - - for (size_t i = 0; i < N / 8; i++) { - vec = _mm256_load_si256(&a->coeffs_x8[i]); - vec = _mm256_slli_epi32(vec, D); - _mm256_store_si256(&a->coeffs_x8[i], vec); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_ntt -* -* Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_ntt(poly *a) { - ALIGNED_UINT64(N) tmp; - - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM4_AVX2_ntt_levels0t2_avx(tmp.as_arr + 4 * i, a->coeffs + 4 * i, PQCLEAN_DILITHIUM4_AVX2_zetas.as_arr + 1); - } - for (size_t i = 0; i < N / 32; ++i) { - PQCLEAN_DILITHIUM4_AVX2_ntt_levels3t8_avx(a->coeffs + 32 * i, tmp.as_arr + 32 * i, PQCLEAN_DILITHIUM4_AVX2_zetas.as_arr + 8 + 31 * i); - } -} - -/************************************************* -* Name: poly_invntt_montgomery -* -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. 
-* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(poly *a) { - ALIGNED_UINT64(N) tmp; - - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM4_AVX2_invntt_levels0t4_avx(tmp.as_arr + 32 * i, a->coeffs + 32 * i, PQCLEAN_DILITHIUM4_AVX2_zetas_inv.as_arr + 31 * i); - } - for (size_t i = 0; i < N / 32; i++) { - PQCLEAN_DILITHIUM4_AVX2_invntt_levels5t7_avx(a->coeffs + 4 * i, tmp.as_arr + 4 * i, PQCLEAN_DILITHIUM4_AVX2_zetas_inv.as_arr + 248); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery -* -* Description: Pointwise multiplication of polynomials in NTT domain -* representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. -* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first input polynomial -* - const poly *b: pointer to second input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - PQCLEAN_DILITHIUM4_AVX2_pointwise_avx(c->coeffs, a->coeffs, b->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_power2round -* -* Description: For all coefficients c of the input polynomial, -* compute c0, c1 such that c mod Q = c1*2^D + c0 -* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. 
-* -* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_power2round(poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_power2round(a->coeffs[i], &a0->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_decompose -* -* Description: For all coefficients c of the input polynomial, -* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 -* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we -* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. -* Assumes coefficients to be standard representatives. -* -* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_decompose( - poly *restrict a1, - poly *restrict a0, - const poly *restrict a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_decompose(a->coeffs[i], &a0->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_make_hint -* -* Description: Compute hint polynomial. The coefficients of which indicate -* whether the low bits of the corresponding coefficient of -* the input polynomial overflow into the high bits. -* -* Arguments: - poly *h: pointer to output hint polynomial -* - const poly *a0: pointer to low part of input polynomial -* - const poly *a1: pointer to high part of input polynomial -* -* Returns number of 1 bits. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_AVX2_poly_make_hint( - poly *restrict h, - const poly *restrict a0, - const poly *restrict a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { - h->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_make_hint(a0->coeffs[i], a1->coeffs[i]); - s += h->coeffs[i]; - } - return s; -} - -/************************************************* - * Name: PQCLEAN_DILITHIUM4_AVX2_poly_use_hint - * - * Description: Use hint polynomial to correct the high bits of a polynomial. -* -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial -* - const poly *h: pointer to input hint polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_poly_use_hint( - poly *restrict a, - const poly *restrict b, - const poly *restrict h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM4_AVX2_use_hint(b->coeffs[i], h->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_chknorm -* -* Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound -* -* Returns 0 if norm is strictly smaller than B and 1 otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(const poly *a, uint32_t B) { - int32_t t; - - /* It is ok to leak which coefficient violates the bound since - the probability for each coefficient is independent of secret - data but we must not leak the sign of the centralized representative. 
*/ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (Q - 1) / 2 - a->coeffs[i]; - t ^= (t >> 31); - t = (Q - 1) / 2 - t; - - if ((uint32_t)t >= B) { - return 1; - } - } - - return 0; -} - -/************************************************* -* Name: rej_uniform_ref -* -* Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_uniform_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t; - - ctr = pos = 0; - while (ctr < len && pos + 3 <= buflen) { - t = buf[pos++]; - t |= (uint32_t)buf[pos++] << 8; - t |= (uint32_t)buf[pos++] << 16; - t &= 0x7FFFFF; - - if (t < Q) { - a[ctr++] = t; - } - } - - return ctr; -} - -/************************************************* -* Name: poly_uniform -* -* Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES -* - uint16_t nonce: 2-byte nonce -**************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform(poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t nblocks = POLY_UNIFORM_NBLOCKS; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; - stream128_state state; - - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, nblocks, &state); - - ctr = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a->coeffs, N, buf, buflen); - - while (ctr < N) { - off = buflen % 3; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM128_BLOCKBYTES + off; - stream128_squeezeblocks(buf + off, 1, &state); - ctr += rej_uniform_ref(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream128_ctx_release(&state); -} - -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t seed[SEEDBYTES], - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE128_RATE]; - __m256i state[25]; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; - - PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], 
inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); - - ctr0 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a0->coeffs, N, outbuf[0], 5 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a1->coeffs, N, outbuf[1], 5 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a2->coeffs, N, outbuf[2], 5 * SHAKE128_RATE); - ctr3 = PQCLEAN_DILITHIUM4_AVX2_rej_uniform(a3->coeffs, N, outbuf[3], 5 * SHAKE128_RATE); - - while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); - - ctr0 += rej_uniform_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE128_RATE); - ctr1 += rej_uniform_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE128_RATE); - ctr2 += rej_uniform_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE128_RATE); - ctr3 += rej_uniform_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE128_RATE); - } -} - -/************************************************* -* Name: rej_eta -* -* Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. 
-**************************************************/ -static size_t rej_eta_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos < buflen) { - t0 = buf[pos] & 0x07; - t1 = buf[pos++] >> 5; - - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; - } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; - } - } - - return ctr; -} - -/************************************************* -* Name: poly_uniform_eta -* -* Description: Sample polynomial with uniformly random coefficients -* in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). -* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES -* - uint16_t nonce: 2-byte nonce -**************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta( - poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; - stream128_state state; - - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); - - while (ctr < N) { - stream128_squeezeblocks(buf, 1, &state); - ctr += rej_eta_ref(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); - } - stream128_ctx_release(&state); -} - -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x( - poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t seed[SEEDBYTES], - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][SEEDBYTES + 2]; - uint8_t outbuf[4][2 * 
SHAKE128_RATE]; - __m256i state[25]; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][SEEDBYTES + 0] = nonce0; - inbuf[0][SEEDBYTES + 1] = nonce0 >> 8; - inbuf[1][SEEDBYTES + 0] = nonce1; - inbuf[1][SEEDBYTES + 1] = nonce1 >> 8; - inbuf[2][SEEDBYTES + 0] = nonce2; - inbuf[2][SEEDBYTES + 1] = nonce2 >> 8; - inbuf[3][SEEDBYTES + 0] = nonce3; - inbuf[3][SEEDBYTES + 1] = nonce3 >> 8; - - PQCLEAN_DILITHIUM4_AVX2_shake128_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - SEEDBYTES + 2); - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 2, - state); - - ctr0 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a0->coeffs, N, outbuf[0], 2 * SHAKE128_RATE); - ctr1 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a1->coeffs, N, outbuf[1], 2 * SHAKE128_RATE); - ctr2 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a2->coeffs, N, outbuf[2], 2 * SHAKE128_RATE); - ctr3 = PQCLEAN_DILITHIUM4_AVX2_rej_eta(a3->coeffs, N, outbuf[3], 2 * SHAKE128_RATE); - - while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM4_AVX2_shake128_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); - - ctr0 += rej_eta_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], SHAKE128_RATE); - ctr1 += rej_eta_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], SHAKE128_RATE); - ctr2 += rej_eta_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], SHAKE128_RATE); - ctr3 += rej_eta_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], SHAKE128_RATE); - } -} - -/************************************************* -* Name: rej_gamma1m1_ref -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. 
-* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_gamma1m1_ref( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1 -* -* Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES -* - uint16_t nonce: 16-bit nonce -**************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - stream256_state state; - - stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); - - ctr = PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1(a->coeffs, N, buf, POLY_UNIFORM_GAMMA1M1_BUFLEN); - - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1_ref(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream256_ctx_release(&state); -} - -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t seed[CRHBYTES], - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - size_t ctr0, ctr1, ctr2, ctr3; - uint8_t inbuf[4][CRHBYTES + 2]; - uint8_t outbuf[4][5 * SHAKE256_RATE]; - __m256i state[25]; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[0][i] = seed[i]; - inbuf[1][i] = seed[i]; - inbuf[2][i] = seed[i]; - inbuf[3][i] = seed[i]; - } - inbuf[0][CRHBYTES + 0] = nonce0 & 0xFF; - inbuf[0][CRHBYTES + 1] = nonce0 >> 8; - inbuf[1][CRHBYTES + 0] = nonce1 & 0xFF; - inbuf[1][CRHBYTES + 1] = nonce1 >> 8; - inbuf[2][CRHBYTES + 0] = nonce2 & 0xFF; - inbuf[2][CRHBYTES + 1] = nonce2 >> 8; - inbuf[3][CRHBYTES + 0] = nonce3 & 0xFF; - inbuf[3][CRHBYTES + 1] = nonce3 >> 8; - - 
PQCLEAN_DILITHIUM4_AVX2_shake256_absorb4x(state, inbuf[0], inbuf[1], inbuf[2], inbuf[3], - CRHBYTES + 2); - PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 5, - state); - - ctr0 = rej_gamma1m1_ref(a0->coeffs, N, outbuf[0], 5 * SHAKE256_RATE); - ctr1 = rej_gamma1m1_ref(a1->coeffs, N, outbuf[1], 5 * SHAKE256_RATE); - ctr2 = rej_gamma1m1_ref(a2->coeffs, N, outbuf[2], 5 * SHAKE256_RATE); - ctr3 = rej_gamma1m1_ref(a3->coeffs, N, outbuf[3], 5 * SHAKE256_RATE); - - while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { - PQCLEAN_DILITHIUM4_AVX2_shake256_squeezeblocks4x(outbuf[0], outbuf[1], outbuf[2], outbuf[3], 1, - state); - - ctr0 += rej_gamma1m1_ref(a0->coeffs + ctr0, N - ctr0, outbuf[0], - SHAKE256_RATE); - ctr1 += rej_gamma1m1_ref(a1->coeffs + ctr1, N - ctr1, outbuf[1], - SHAKE256_RATE); - ctr2 += rej_gamma1m1_ref(a2->coeffs + ctr2, N - ctr2, outbuf[2], - SHAKE256_RATE); - ctr3 += rej_gamma1m1_ref(a3->coeffs + ctr3, N - ctr3, outbuf[3], - SHAKE256_RATE); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyeta_pack -* -* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. -* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(uint8_t *restrict r, const poly *restrict a) { - uint8_t t[8]; - for (size_t i = 0; i < N / 8; ++i) { - t[0] = Q + ETA - a->coeffs[8 * i + 0]; - t[1] = Q + ETA - a->coeffs[8 * i + 1]; - t[2] = Q + ETA - a->coeffs[8 * i + 2]; - t[3] = Q + ETA - a->coeffs[8 * i + 3]; - t[4] = Q + ETA - a->coeffs[8 * i + 4]; - t[5] = Q + ETA - a->coeffs[8 * i + 5]; - t[6] = Q + ETA - a->coeffs[8 * i + 6]; - t[7] = Q + ETA - a->coeffs[8 * i + 7]; - - r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); - r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); - r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack -* -* Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = a[3 * i + 0] & 0x07; - r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 0x07; - r->coeffs[8 * i + 2] = (uint32_t)((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 0x07; - r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 0x07; - r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 0x07; - r->coeffs[8 * i + 5] = (uint32_t)((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 0x07; - r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 0x07; - r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 0x07; - - r->coeffs[8 * i + 0] = Q + ETA - r->coeffs[8 * i + 0]; - r->coeffs[8 * i + 1] = Q + ETA - r->coeffs[8 * i + 1]; - r->coeffs[8 * i + 2] = Q + ETA - r->coeffs[8 * i + 2]; - r->coeffs[8 * i + 3] = Q + ETA - r->coeffs[8 * i + 3]; - r->coeffs[8 * i + 4] = Q + ETA - r->coeffs[8 * i + 4]; - r->coeffs[8 * i + 5] = Q + ETA - r->coeffs[8 * i + 5]; - r->coeffs[8 * i + 6] = Q + ETA - r->coeffs[8 * i + 6]; - r->coeffs[8 * i + 7] = Q + ETA - r->coeffs[8 * i + 7]; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyt1_pack -* -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. -* Input coefficients are assumed to be standard representatives. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(uint8_t *restrict r, const poly *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack -* -* Description: Unpack polynomial t1 with 9-bit coefficients. -* Output coefficients are standard representatives. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] >> 0) | ((uint32_t)a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t)a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t)a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t)a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t)a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t)a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t)a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t)a[9 * i + 8] << 1)) & 0x1FF; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyt0_pack -* -* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[4]; - - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1U << (D - 1)) - a->coeffs[4 * i + 3]; - - r[7 * i + 0] = t[0]; - r[7 * i + 1] = t[0] >> 8; - r[7 * i + 1] |= t[1] << 6; - r[7 * i + 2] = t[1] >> 2; - r[7 * i + 3] = t[1] >> 10; - r[7 * i + 3] |= t[2] << 4; - r[7 * i + 4] = t[2] >> 4; - r[7 * i + 5] = t[2] >> 12; - r[7 * i + 5] |= t[3] << 2; - r[7 * i + 6] = t[3] >> 6; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack -* -* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8; - - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10; - - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12; - - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6; - - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyz_pack -* -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyz_pack(uint8_t *restrict r, const poly *restrict a) { - uint32_t t[2]; - - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; - - r[5 * i + 0] = t[0]; - r[5 * i + 1] = t[0] >> 8; - r[5 * i + 2] = t[0] >> 16; - r[5 * i + 2] |= t[1] << 4; - r[5 * i + 3] = t[1] >> 4; - r[5 * i + 4] = t[1] >> 12; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyz_unpack -* -* Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(poly *restrict r, const uint8_t *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16; - - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; - - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; - } - -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyw1_pack -* -* Description: Bit-pack polynomial w1 with 
coefficients in [0, 15]. -* Input coefficients are assumed to be standard representatives. -* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyw1_pack( - uint8_t *restrict r, - const poly *restrict a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4); - } -} diff --git a/crypto_sign/dilithium4/avx2/poly.h b/crypto_sign/dilithium4/avx2/poly.h deleted file mode 100644 index 52e594a5..00000000 --- a/crypto_sign/dilithium4/avx2/poly.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef POLY_H -#define POLY_H - -#include -#include - -#include "alignment.h" -#include "params.h" - -typedef union { - uint32_t coeffs[N]; - __m256i coeffs_x8[N / 8]; -} poly; - -void PQCLEAN_DILITHIUM4_AVX2_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM4_AVX2_poly_csubq(poly *a); -void PQCLEAN_DILITHIUM4_AVX2_poly_freeze(poly *a); - -void PQCLEAN_DILITHIUM4_AVX2_poly_add(poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM4_AVX2_poly_sub(poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(poly *a); - -void PQCLEAN_DILITHIUM4_AVX2_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b); - -void PQCLEAN_DILITHIUM4_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); -void PQCLEAN_DILITHIUM4_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); -unsigned int PQCLEAN_DILITHIUM4_AVX2_poly_make_hint(poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM4_AVX2_poly_use_hint(poly *a, const poly *b, const poly *h); - -int PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform(poly *a, - const uint8_t *seed, - uint16_t nonce); -void 
PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta(poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1(poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(poly *a0, - poly *a1, - poly *a2, - poly *a3, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); - -void PQCLEAN_DILITHIUM4_AVX2_polyeta_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_AVX2_polyeta_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_AVX2_polyt1_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_AVX2_polyt1_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_AVX2_polyt0_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_AVX2_polyt0_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_AVX2_polyz_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_AVX2_polyz_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_AVX2_polyw1_pack(uint8_t *r, const poly *a); -#endif diff --git a/crypto_sign/dilithium4/avx2/polyvec.c b/crypto_sign/dilithium4/avx2/polyvec.c deleted file mode 100644 index 3c5d4165..00000000 --- a/crypto_sign/dilithium4/avx2/polyvec.c +++ /dev/null @@ -1,323 +0,0 @@ -#include - -#include "ntt.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" - -/**************************************************************/ -/************ Vectors of polynomials of length L **************/ -/**************************************************************/ - 
-/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze -* -* Description: Reduce coefficients of polynomials in vector of length L -* to standard representatives. -* -* Arguments: - polyvecl *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_freeze(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_add -* -* Description: Add vectors of polynomials of length L. -* No modular reduction is performed. -* -* Arguments: - polyvecl *w: pointer to output vector -* - const polyvecl *u: pointer to first summand -* - const polyvecl *v: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt -* -* Description: Forward NTT of all polynomials in vector of length L. Output -* coefficients can be up to 16*Q larger than input coefficients. -* -* Arguments: - polyvecl *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery -* -* Description: Pointwise multiply vectors of polynomials of length L, multiply -* resulting vector by 2^{-32} and add (accumulate) polynomials -* in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. 
Output -* coeffcient are less than 2*L*Q. -* -* Arguments: - poly *w: output polynomial -* - const polyvecl *u: pointer to first input vector -* - const polyvecl *v: pointer to second input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(poly *w, - const polyvecl *u, - const polyvecl *v) { - PQCLEAN_DILITHIUM4_AVX2_pointwise_acc_avx(w->coeffs, u->vec->coeffs, v->vec->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm -* -* Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound -* -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t bound) { - for (size_t i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(&v->vec[i], bound)) { - return 1; - } - } - - return 0; -} - -/**************************************************************/ -/************ Vectors of polynomials of length K **************/ -/**************************************************************/ - - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce -* -* Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. 
-* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq -* -* Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_csubq(&v->vec[i]); - } -} - -/************************************************* -* Name: polyveck_freeze -* -* Description: Reduce coefficients of polynomials in vector of length K -* to standard representatives. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_freeze(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_add -* -* Description: Add vectors of polynomials of length K. -* No modular reduction is performed. 
-* -* Arguments: - polyveck *w: pointer to output vector -* - const polyveck *u: pointer to first summand -* - const polyveck *v: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_sub -* -* Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. -* -* Arguments: - polyveck *w: pointer to output vector -* - const polyveck *u: pointer to first input vector -* - const polyveck *v: pointer to second input vector to be -* subtracted from first input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl -* -* Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_shiftl(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt -* -* Description: Forward NTT of all polynomials in vector of length K. Output -* coefficients can be up to 16*Q larger than input coefficients. 
-* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery -* -* Description: Inverse NTT and multiplication by 2^{32} of polynomials -* in vector of length K. Input coefficients need to be less -* than 2*Q. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm -* -* Description: Check infinity norm of polynomials in vector of length K. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound -* -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(const polyveck *v, uint32_t bound) { - for (size_t i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM4_AVX2_poly_chknorm(&v->vec[i], bound)) { - return 1; - } - } - - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round -* -* Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 -* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. 
-* -* Arguments: - polyveck *v1: pointer to output vector of polynomials with -* coefficients a1 -* - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 -* - const polyveck *v: pointer to input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose -* -* Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 -* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we -* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. -* Assumes coefficients to be standard representatives. -* -* Arguments: - polyveck *v1: pointer to output vector of polynomials with -* coefficients a1 -* - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 -* - const polyveck *v: pointer to input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint -* -* Description: Compute hint vector. -* -* Arguments: - polyveck *h: pointer to output vector -* - const polyveck *v0: pointer to low part of input vector -* - const polyveck *v1: pointer to high part of input vector -* -* Returns number of 1 bits. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; - - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM4_AVX2_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); - } - - return s; -} - -/************************************************* -* Name: polyveck_use_hint -* -* Description: Use hint vector to correct the high bits of input vector. -* -* Arguments: - polyveck *w: pointer to output vector of polynomials with -* corrected high bits -* - const polyveck *v: pointer to input vector -* - const polyveck *h: pointer to input hint vector -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_use_hint(&w->vec[i], &v->vec[i], &h->vec[i]); - } -} diff --git a/crypto_sign/dilithium4/avx2/polyvec.h b/crypto_sign/dilithium4/avx2/polyvec.h deleted file mode 100644 index 49f92364..00000000 --- a/crypto_sign/dilithium4/avx2/polyvec.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_POLYVEC_H -#define PQCLEAN_DILITHIUM4_AVX2_POLYVEC_H - -#include - -#include "params.h" -#include "poly.h" - -/* Vectors of polynomials of length L */ -typedef struct { - poly vec[L]; -} polyvecl; - -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(polyvecl *v); - -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); - -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(polyvecl *v); -void PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); - -int PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(const polyvecl *v, uint32_t B); - - - -/* Vectors of polynomials of length K */ -typedef struct { - poly vec[K]; -} polyveck; - -void PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(polyveck *v); -void 
PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(polyveck *v); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(polyveck *v); - -void PQCLEAN_DILITHIUM4_AVX2_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(polyveck *v); - -void PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(polyveck *v); - -int PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm( - const polyveck *v, uint32_t B); - -void PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); - -#endif diff --git a/crypto_sign/dilithium4/avx2/reduce.S b/crypto_sign/dilithium4/avx2/reduce.S deleted file mode 100644 index ae1dcdad..00000000 --- a/crypto_sign/dilithium4/avx2/reduce.S +++ /dev/null @@ -1,93 +0,0 @@ -#include "cdecl.inc" - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_reduce_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8x23ones)(%rip),%ymm0 - -xor %eax,%eax -_looptop_rdc32: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#reduce -vpsrld $23,%ymm1,%ymm2 -vpsrld $23,%ymm3,%ymm4 -vpsrld $23,%ymm5,%ymm6 -vpsrld $23,%ymm7,%ymm8 -vpand %ymm0,%ymm1,%ymm1 -vpand %ymm0,%ymm3,%ymm3 -vpand %ymm0,%ymm5,%ymm5 -vpand %ymm0,%ymm7,%ymm7 -vpsubd %ymm2,%ymm1,%ymm1 -vpsubd %ymm4,%ymm3,%ymm3 -vpsubd %ymm6,%ymm5,%ymm5 -vpsubd %ymm8,%ymm7,%ymm7 -vpslld $13,%ymm2,%ymm2 -vpslld $13,%ymm4,%ymm4 -vpslld $13,%ymm6,%ymm6 -vpslld $13,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd 
%ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd %ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_rdc32 - -ret - -.global cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx) -cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq_avx): -#consts -vmovdqa cdecl(PQCLEAN_DILITHIUM4_AVX2_8xq)(%rip),%ymm0 - -xor %eax,%eax -_looptop_csubq: -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm7 - -#cdecl(PQCLEAN_DILITHIUM4_AVX2_csubq) -vpsubd %ymm0,%ymm1,%ymm1 -vpsubd %ymm0,%ymm3,%ymm3 -vpsubd %ymm0,%ymm5,%ymm5 -vpsubd %ymm0,%ymm7,%ymm7 -vpsrad $31,%ymm1,%ymm2 -vpsrad $31,%ymm3,%ymm4 -vpsrad $31,%ymm5,%ymm6 -vpsrad $31,%ymm7,%ymm8 -vpand %ymm0,%ymm2,%ymm2 -vpand %ymm0,%ymm4,%ymm4 -vpand %ymm0,%ymm6,%ymm6 -vpand %ymm0,%ymm8,%ymm8 -vpaddd %ymm2,%ymm1,%ymm1 -vpaddd %ymm4,%ymm3,%ymm3 -vpaddd %ymm6,%ymm5,%ymm5 -vpaddd %ymm8,%ymm7,%ymm7 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -add $128,%rdi -add $1,%eax -cmp $8,%eax -jb _looptop_csubq - -ret diff --git a/crypto_sign/dilithium4/avx2/reduce.h b/crypto_sign/dilithium4/avx2/reduce.h deleted file mode 100644 index ccb2f18c..00000000 --- a/crypto_sign/dilithium4/avx2/reduce.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef REDUCE_H -#define REDUCE_H - -#include - -void PQCLEAN_DILITHIUM4_AVX2_reduce_avx(uint32_t a[N]); -void PQCLEAN_DILITHIUM4_AVX2_csubq_avx(uint32_t a[N]); - -#endif diff --git a/crypto_sign/dilithium4/avx2/rejsample.h b/crypto_sign/dilithium4/avx2/rejsample.h deleted file mode 100644 index c72a6fc7..00000000 --- a/crypto_sign/dilithium4/avx2/rejsample.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef REJSAMPLE_H -#define REJSAMPLE_H - -#include -#include - -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); - -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_eta( - 
uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); - -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); - -#endif diff --git a/crypto_sign/dilithium4/avx2/rounding.c b/crypto_sign/dilithium4/avx2/rounding.c deleted file mode 100644 index 80d28229..00000000 --- a/crypto_sign/dilithium4/avx2/rounding.c +++ /dev/null @@ -1,115 +0,0 @@ -#include "rounding.h" - -/************************************************* -* Name: power2round -* -* Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. -* Assumes a to be standard representative. -* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 -* -* Returns a1. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_AVX2_power2round(uint32_t a, uint32_t *a0) { - int32_t t; - - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += (t >> 31) & (1U << D); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_decompose -* -* Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except -* if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard -* representative. -* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 -* -* Returns a1. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_AVX2_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; - - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFF; - t += (a >> 19) << 9; - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= t; - - /* Divide by ALPHA (possible to avoid) */ - u = a - 1; - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - - /* Border case */ - *a0 = Q + t - (a >> 4); - a &= 0xF; - return a; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_make_hint -* -* Description: Compute hint bit indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. -* -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element -* -* Returns 1 if high bits of a and b differ and 0 otherwise. -**************************************************/ -unsigned int PQCLEAN_DILITHIUM4_AVX2_make_hint(const uint32_t a0, const uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; - } - - return 1; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_use_hint -* -* Description: Correct high bits according to hint. -* -* Arguments: - uint32_t a: input element -* - unsigned int hint: hint bit -* -* Returns corrected high bits. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_AVX2_use_hint(const uint32_t a, const unsigned int hint) { - uint32_t a0, a1; - - a1 = PQCLEAN_DILITHIUM4_AVX2_decompose(a, &a0); - if (hint == 0) { - return a1; - } - if (a0 > Q) { - return (a1 + 1) & 0xF; - } - return (a1 - 1) & 0xF; - - /* If decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ -} diff --git a/crypto_sign/dilithium4/avx2/rounding.h b/crypto_sign/dilithium4/avx2/rounding.h deleted file mode 100644 index 611791c9..00000000 --- a/crypto_sign/dilithium4/avx2/rounding.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef ROUNDING_H -#define ROUNDING_H - -#include "params.h" -#include - -uint32_t PQCLEAN_DILITHIUM4_AVX2_power2round(uint32_t a, uint32_t *a0); -uint32_t PQCLEAN_DILITHIUM4_AVX2_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM4_AVX2_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM4_AVX2_use_hint(uint32_t a, unsigned int hint); - -#endif diff --git a/crypto_sign/dilithium4/avx2/shuffle.inc b/crypto_sign/dilithium4/avx2/shuffle.inc deleted file mode 100644 index df352030..00000000 --- a/crypto_sign/dilithium4/avx2/shuffle.inc +++ /dev/null @@ -1,23 +0,0 @@ -.macro shuffle8 r0,r1,r2,r3 -vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 -vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle4 r0,r1,r2,r3 -vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 -vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 -.endm - -.macro shuffle2 r0,r1,r2,r3 -vpsllq $32,%ymm\r1,%ymm12 -vpsrlq $32,%ymm\r0,%ymm13 -vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendd $0xAA,%ymm\r1,%ymm13,%ymm\r3 -.endm - -.macro shuffle1 r0,r1,r2,r3 -vpslld $16,%ymm\r1,%ymm12 -vpsrld $16,%ymm\r0,%ymm13 -vpblendw $0xAA,%ymm12,%ymm\r0,%ymm\r2 -vpblendw $0xAA,%ymm\r1,%ymm13,%ymm\r3 -.endm diff --git a/crypto_sign/dilithium4/avx2/sign.c b/crypto_sign/dilithium4/avx2/sign.c deleted file mode 100644 
index 641adef6..00000000 --- a/crypto_sign/dilithium4/avx2/sign.c +++ /dev/null @@ -1,463 +0,0 @@ -#include -#include - -#include "fips202.h" -#include "packing.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" -#include "randombytes.h" -#include "sign.h" -#include "symmetric.h" - -/************************************************* -* Name: expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). -* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_expand_mat(polyvecl mat[6], const uint8_t rho[SEEDBYTES]) { - poly t0, t1; - - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[0].vec[0], - &mat[0].vec[1], - &mat[0].vec[2], - &mat[0].vec[3], - rho, 0, 1, 2, 3); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[0].vec[4], - &mat[1].vec[0], - &mat[1].vec[1], - &mat[1].vec[2], - rho, 4, 256, 257, 258); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[1].vec[3], - &mat[1].vec[4], - &mat[2].vec[0], - &mat[2].vec[1], - rho, 259, 260, 512, 513); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[2].vec[2], - &mat[2].vec[3], - &mat[2].vec[4], - &mat[3].vec[0], - rho, 514, 515, 516, 768); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[3].vec[1], - &mat[3].vec[2], - &mat[3].vec[3], - &mat[3].vec[4], - rho, 769, 770, 771, 772); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[4].vec[0], - &mat[4].vec[1], - &mat[4].vec[2], - &mat[4].vec[3], - rho, 1024, 1025, 1026, 1027); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[4].vec[4], - &mat[5].vec[0], - &mat[5].vec[1], - &mat[5].vec[2], - rho, 1028, 1280, 1281, 1282); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_4x(&mat[5].vec[3], - &mat[5].vec[4], - &t0, - &t1, - rho, 1283, 1284, 0, 0); -} - -/************************************************* -* Name: 
PQCLEAN_DILITHIUM4_AVX2_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). -* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM4_AVX2_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint8_t b; - size_t pos; - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t) outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -(signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - shake256_ctx_release(&state); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_keypair -* -* Description: Generates public and private key. 
-* -* Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM4_AVX2_CRYPTO_SECRETKEYBYTES bytes) -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { - uint8_t seedbuf[3 * SEEDBYTES]; - uint8_t tr[CRHBYTES]; - const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; - polyvecl mat[K]; - polyvecl s1, s1hat; - polyveck s2, t, t1, t0; - - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); - rho = seedbuf; - rhoprime = seedbuf + SEEDBYTES; - key = seedbuf + 2 * SEEDBYTES; - - /* Expand matrix */ - PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); - - /* Sample short vectors s1 and s2 */ - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, - nonce, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s2.vec[0], &s2.vec[1], &s2.vec[2], rhoprime, - nonce + 4, nonce + 5, nonce + 6, nonce + 7); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_eta_4x(&s2.vec[3], &s2.vec[4], &s2.vec[5], &t.vec[0], rhoprime, - nonce + 8, nonce + 9, nonce + 10, 0); - - /* Matrix-vector multiplication */ - s1hat = s1; - PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - //PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&t.vec[i]); - } - - /* Add error vector s2 */ - PQCLEAN_DILITHIUM4_AVX2_polyveck_add(&t, &t, &s2); - - /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(&t); - PQCLEAN_DILITHIUM4_AVX2_polyveck_power2round(&t1, &t0, &t); - PQCLEAN_DILITHIUM4_AVX2_pack_pk(pk, rho, &t1); - - /* Compute CRH(rho, 
t1) and write secret key */ - crh(tr, pk, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM4_AVX2_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); - - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature -* -* Description: Compute signed message. -* -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES -* of len) -* - size_t *siglen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - uint32_t n; - uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; - uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; - - rho = seedbuf; - tr = rho + SEEDBYTES; - key = tr + CRHBYTES; - mu = key + SEEDBYTES; - rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM4_AVX2_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); - - - // use incremental hash API instead of copying around buffers - /* Compute CRH(tr, m) */ - shake256incctx state; - shake256_inc_init(&state); - shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); - shake256_inc_ctx_release(&state); - - crh(rhoprime, key, SEEDBYTES + CRHBYTES); - - /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); - PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&s1); - PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&s2); - PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&t0); - -rej: - /* Sample intermediate vector y */ - 
PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1_4x(&y.vec[0], &y.vec[1], &y.vec[2], &y.vec[3], - rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_DILITHIUM4_AVX2_poly_uniform_gamma1m1(&y.vec[4], rhoprime, nonce + 4); - nonce += 5; - - /* Matrix-vector multiplication */ - yhat = y; - PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM4_AVX2_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&w.vec[i]); - } - - /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&w); - PQCLEAN_DILITHIUM4_AVX2_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM4_AVX2_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&chat); - - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(&w0, &w0, &cs2); - PQCLEAN_DILITHIUM4_AVX2_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } - - /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&z.vec[i]); - } - PQCLEAN_DILITHIUM4_AVX2_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM4_AVX2_polyvecl_freeze(&z); - if (PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - goto rej; - } - - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM4_AVX2_poly_invntt_montgomery(&ct0.vec[i]); - } - - 
PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM4_AVX2_polyveck_chknorm(&ct0, GAMMA2)) { - goto rej; - } - - PQCLEAN_DILITHIUM4_AVX2_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&w0); - n = PQCLEAN_DILITHIUM4_AVX2_polyveck_make_hint(&h, &w0, &w1); - if (n > OMEGA) { - goto rej; - } - - /* Write signature */ - PQCLEAN_DILITHIUM4_AVX2_pack_sig(sig, &z, &h, &c); - *siglen = PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES; - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign -* -* Description: Compute signed message. -* -* Arguments: - uint8_t *sm: pointer to output signed message (allocated -* array with PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES + mlen bytes), -* can be equal to m -* - size_t *smlen: pointer to output length of signed -* message -* - const uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - const uint8_t *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, m, mlen); - rc = PQCLEAN_DILITHIUM4_AVX2_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify -* -* Description: Verify signed message. 
-* -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key -* -* Returns 0 if signed message could be verified correctly and -1 otherwise -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, - const uint8_t *pk) { - uint8_t rho[SEEDBYTES]; - uint8_t mu[CRHBYTES]; - poly c, chat, cp; - polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; - - if (siglen < PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) { - return -1; - } - - PQCLEAN_DILITHIUM4_AVX2_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM4_AVX2_unpack_sig(&z, &h, &c, sig)) { - return -1; - } - if (PQCLEAN_DILITHIUM4_AVX2_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - return -1; - } - - /* Compute CRH(CRH(rho, t1), msg) */ - crh(mu, pk, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); - shake256_inc_ctx_release(&state); - - /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM4_AVX2_expand_mat(mat, rho); - PQCLEAN_DILITHIUM4_AVX2_polyvecl_ntt(&z); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); - } - - chat = c; - PQCLEAN_DILITHIUM4_AVX2_poly_ntt(&chat); - PQCLEAN_DILITHIUM4_AVX2_polyveck_shiftl(&t1); - PQCLEAN_DILITHIUM4_AVX2_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_AVX2_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); - } - - PQCLEAN_DILITHIUM4_AVX2_polyveck_sub(&tmp1, &tmp1, &tmp2); - PQCLEAN_DILITHIUM4_AVX2_polyveck_reduce(&tmp1); - 
PQCLEAN_DILITHIUM4_AVX2_polyveck_invntt_montgomery(&tmp1); - - /* Reconstruct w1 */ - PQCLEAN_DILITHIUM4_AVX2_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM4_AVX2_polyveck_use_hint(&w1, &tmp1, &h); - - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM4_AVX2_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { - return -1; - } - } - - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_AVX2_crypto_sign_open -* -* Description: Verify signed message. -* -* Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm -* - size_t *mlen: pointer to output length of message -* - const uint8_t *sm: pointer to signed message -* - size_t smlen: length of signed message -* - const uint8_t *pk: pointer to bit-packed public key -* -* Returns 0 if signed message could be verified correctly and -1 otherwise -**************************************************/ -int PQCLEAN_DILITHIUM4_AVX2_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { - if (smlen < PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES) { - goto badsig; - } - *mlen = smlen - PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES; - - if (PQCLEAN_DILITHIUM4_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES, *mlen, pk)) { - goto badsig; - } else { - /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { - m[i] = sm[PQCLEAN_DILITHIUM4_AVX2_CRYPTO_BYTES + i]; - } - return 0; - } - - /* Signature verification failed */ -badsig: - *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { - m[i] = 0; - } - - return -1; -} diff --git a/crypto_sign/dilithium4/avx2/sign.h b/crypto_sign/dilithium4/avx2/sign.h deleted file mode 100644 index 73968a7f..00000000 --- a/crypto_sign/dilithium4/avx2/sign.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef SIGN_H -#define SIGN_H - -#include "api.h" 
-#include "params.h" -#include "poly.h" -#include "polyvec.h" - -void PQCLEAN_DILITHIUM4_AVX2_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM4_AVX2_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); - - -#endif - diff --git a/crypto_sign/dilithium4/avx2/stream.c b/crypto_sign/dilithium4/avx2/stream.c deleted file mode 100644 index 2163bc19..00000000 --- a/crypto_sign/dilithium4/avx2/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium4/avx2/stream.h b/crypto_sign/dilithium4/avx2/stream.h deleted file mode 100644 index 87a280e4..00000000 --- a/crypto_sign/dilithium4/avx2/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_AVX2_STREAM_H -#define PQCLEAN_DILITHIUM4_AVX2_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium4/avx2/symmetric.h b/crypto_sign/dilithium4/avx2/symmetric.h deleted file mode 100644 index 3309c1de..00000000 --- a/crypto_sign/dilithium4/avx2/symmetric.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef 
PQCLEAN_DILITHIUM4_AVX2_SYMMETRIC_H -#define PQCLEAN_DILITHIUM4_AVX2_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - -#include "fips202.h" - -#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_AVX2_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_AVX2_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) - -#define STREAM128_BLOCKBYTES SHAKE128_RATE -#define STREAM256_BLOCKBYTES SHAKE256_RATE - -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; - - -#endif diff --git a/crypto_sign/dilithium4/clean/LICENSE b/crypto_sign/dilithium4/clean/LICENSE deleted file mode 100644 index 40541676..00000000 --- a/crypto_sign/dilithium4/clean/LICENSE +++ /dev/null @@ -1,6 +0,0 @@ -Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) - -For Keccak and the random number generator -we are using public-domain code from sources -and by authors listed in comments on top of -the respective files. 
diff --git a/crypto_sign/dilithium4/clean/Makefile b/crypto_sign/dilithium4/clean/Makefile deleted file mode 100644 index 1f8fcac7..00000000 --- a/crypto_sign/dilithium4/clean/Makefile +++ /dev/null @@ -1,22 +0,0 @@ -# This Makefile can be used with GNU Make or BSD Make - -LIB=libdilithium4_clean.a - -SOURCES = sign.c polyvec.c poly.c packing.c ntt.c reduce.c rounding.c stream.c -OBJECTS = sign.o polyvec.o poly.o packing.o ntt.o reduce.o rounding.o stream.o -HEADERS = api.h params.h sign.h polyvec.h poly.h packing.h ntt.h \ - reduce.h rounding.h symmetric.h stream.h - -CFLAGS=-O3 -Wall -Wconversion -Wextra -Wpedantic -Wvla -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) - -all: $(LIB) - -%.o: %.c $(HEADERS) - $(CC) $(CFLAGS) -c -o $@ $< - -$(LIB): $(OBJECTS) - $(AR) -r $@ $(OBJECTS) - -clean: - $(RM) $(OBJECTS) - $(RM) $(LIB) diff --git a/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake deleted file mode 100644 index 99ffb30f..00000000 --- a/crypto_sign/dilithium4/clean/Makefile.Microsoft_nmake +++ /dev/null @@ -1,18 +0,0 @@ -# This Makefile can be used with Microsoft Visual Studio's nmake using the command: -# nmake /f Makefile.Microsoft_nmake - -LIBRARY=libdilithium4_clean.lib -OBJECTS=sign.obj polyvec.obj poly.obj packing.obj ntt.obj reduce.obj rounding.obj stream.obj -CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX - -all: $(LIBRARY) - -# Make sure objects are recompiled if headers change. 
-$(OBJECTS): *.h - -$(LIBRARY): $(OBJECTS) - LIB.EXE /NOLOGO /WX /OUT:$@ $** - -clean: - -DEL $(OBJECTS) - -DEL $(LIBRARY) diff --git a/crypto_sign/dilithium4/clean/api.h b/crypto_sign/dilithium4/clean/api.h deleted file mode 100644 index 8231600f..00000000 --- a/crypto_sign/dilithium4/clean/api.h +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_API_H -#define PQCLEAN_DILITHIUM4_CLEAN_API_H - -#include -#include - - -#define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES 1760U -#define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES 3856U -#define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES 3366U - -#define PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_ALGNAME "Dilithium4" - - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair( - uint8_t *pk, uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *msg, size_t len, - const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *m, size_t mlen, const uint8_t *sk); - -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk); - - - -#endif diff --git a/crypto_sign/dilithium4/clean/ntt.c b/crypto_sign/dilithium4/clean/ntt.c deleted file mode 100644 index f150818e..00000000 --- a/crypto_sign/dilithium4/clean/ntt.c +++ /dev/null @@ -1,138 +0,0 @@ -#include - -#include "params.h" -#include "ntt.h" -#include "poly.h" -#include "reduce.h" - -/* Roots of unity in order needed by forward PQCLEAN_DILITHIUM4_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM4_CLEAN_zetas[N] = { - 0, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, 1826347, - 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, 2725464, - 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, 6262231, - 4520680, 
6980856, 5102745, 1757237, 8360995, 4010497, 280005, 2706023, - 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, 4519302, - 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, 6681150, - 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, 811944, - 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, 4450022, - 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196, 7122806, - 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, 3412210, - 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, 7709315, - 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, 5037034, - 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, 4656075, 8371839, - 1653064, 5130689, 2389356, 8169440, 759969, 7063561, 189548, 4827145, - 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, 1285669, 6795489, - 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, 2091667, 3407706, - 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, 266997, 2434439, - 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, 900702, 1859098, - 909542, 819034, 495491, 6767243, 8337157, 7857917, 7725090, 5257975, - 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, 342297, 286988, - 5942594, 4108315, 3437287, 5038140, 1735879, 203044, 2842341, 2691481, - 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, 4613401, 1250494, - 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, 7047359, 1237275, - 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, 7100756, 1917081, - 5834105, 7005614, 1500165, 777191, 2235880, 3406031, 7838005, 5548557, - 6709241, 6533464, 5796124, 4656147, 594136, 4603424, 6366809, 2432395, - 2454455, 8215696, 1957272, 3369112, 185531, 7173032, 5196991, 162844, - 1616392, 3014001, 810149, 1652634, 4686184, 6581310, 5341501, 3523897, - 3866901, 269760, 2213111, 7404533, 1717735, 472078, 7953734, 1723600, - 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, 5441381, 6144432, - 7959518, 6094090, 183443, 
7403526, 1612842, 4834730, 7826001, 3919660, - 8332111, 7018208, 3937738, 1400424, 7534263, 1976782 -}; - -/* Roots of unity in order needed by inverse PQCLEAN_DILITHIUM4_CLEAN_ntt */ -static const uint32_t PQCLEAN_DILITHIUM4_CLEAN_zetas_inv[N] = { - 6403635, 846154, 6979993, 4442679, 1362209, 48306, 4460757, 554416, - 3545687, 6767575, 976891, 8196974, 2286327, 420899, 2235985, 2939036, - 3833893, 260646, 1104333, 1667432, 6470041, 1803090, 6656817, 426683, - 7908339, 6662682, 975884, 6167306, 8110657, 4513516, 4856520, 3038916, - 1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426, - 1207385, 8194886, 5011305, 6423145, 164721, 5925962, 5948022, 2013608, - 3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412, - 4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661, - 1962642, 5074302, 7067962, 451100, 1430225, 3318210, 7143142, 1333058, - 1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016, - 6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076, - 8177373, 6644538, 3342277, 4943130, 4272102, 2437823, 8093429, 8038120, - 3595838, 768622, 525098, 3556995, 5173371, 6348669, 3122442, 655327, - 522500, 43260, 1613174, 7884926, 7561383, 7470875, 6521319, 7479715, - 3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420, - 3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750, - 4540456, 3628969, 3881060, 3019102, 1439742, 812732, 1584928, 7094748, - 7039087, 7064828, 177440, 2409325, 1851402, 5220671, 3553272, 8190869, - 1316856, 7620448, 210977, 5991061, 3249728, 6727353, 8578, 3724342, - 4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383, - 1430430, 6527646, 7031341, 381987, 1308169, 22981, 1228525, 671102, - 2477047, 411027, 3693493, 2967645, 5665122, 6232521, 983419, 4968207, - 8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611, - 1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395, - 
2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473, - 4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267, - 539299, 6031717, 300467, 4840449, 2867647, 4805995, 3043716, 3861115, - 4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394, - 8100412, 4369920, 19422, 6623180, 3277672, 1399561, 3859737, 2118186, - 2108549, 5760665, 1119584, 549488, 4794489, 1079900, 7356305, 5654953, - 5700314, 5268920, 2884855, 5260684, 2091905, 359251, 6026966, 6554070, - 7913949, 876248, 777960, 8143293, 518909, 2608894, 8354570 -}; - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_ntt -* -* Description: Forward NTT, in-place. No modular reduction is performed after -* additions or subtractions. Hence output coefficients can be up -* to 16*Q larger than the coefficients of the input polynomial. -* Output vector is in bitreversed order. -* -* Arguments: - uint32_t p[N]: input/output coefficient array -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]) { - size_t k, j; - uint32_t zeta, t; - - k = 1; - for (size_t len = 128; len > 0; len >>= 1) { - for (size_t start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM4_CLEAN_zetas[k++]; - for (j = start; j < start + len; ++j) { - t = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); - p[j + len] = p[j] + 2 * Q - t; - p[j] = p[j] + t; - } - } - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont -* -* Description: Inverse NTT and multiplication by Montgomery factor 2^32. -* In-place. No modular reductions after additions or -* subtractions. Input coefficient need to be smaller than 2*Q. -* Output coefficient are smaller than 2*Q. 
-* -* Arguments: - uint32_t p[N]: input/output coefficient array -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t p[N]) { - size_t start, len, j, k; - uint32_t t, zeta; - const uint32_t f = (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q; - - k = 0; - for (len = 1; len < N; len <<= 1) { - for (start = 0; start < N; start = j + len) { - zeta = PQCLEAN_DILITHIUM4_CLEAN_zetas_inv[k++]; - for (j = start; j < start + len; ++j) { - t = p[j]; - p[j] = t + p[j + len]; - p[j + len] = t + 256 * Q - p[j + len]; - p[j + len] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) zeta * p[j + len]); - } - } - } - - for (j = 0; j < N; ++j) { - p[j] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t) f * p[j]); - } -} diff --git a/crypto_sign/dilithium4/clean/ntt.h b/crypto_sign/dilithium4/clean/ntt.h deleted file mode 100644 index 21f4d5b8..00000000 --- a/crypto_sign/dilithium4/clean/ntt.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_NTT_H -#define PQCLEAN_DILITHIUM4_CLEAN_NTT_H - -#include - -#include "params.h" - -void PQCLEAN_DILITHIUM4_CLEAN_ntt(uint32_t p[N]); -void PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(uint32_t p[N]); - -#endif diff --git a/crypto_sign/dilithium4/clean/packing.c b/crypto_sign/dilithium4/clean/packing.c deleted file mode 100644 index 4513baaa..00000000 --- a/crypto_sign/dilithium4/clean/packing.c +++ /dev/null @@ -1,297 +0,0 @@ -#include "packing.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_pk -* -* Description: Bit-pack public key pk = (rho, t1). 
-* -* Arguments: - uint8_t pk[]: output byte array -* - const uint8_t rho[]: byte array containing rho -* - const polyveck *t1: pointer to vector t1 -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const polyveck *t1) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - pk[i] = rho[i]; - } - pk += SEEDBYTES; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(pk + i * POLT1_SIZE_PACKED, &t1->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_pk -* -* Description: Unpack public key pk = (rho, t1). -* -* Arguments: - const uint8_t rho[]: output byte array for rho -* - const polyveck *t1: pointer to output vector t1 -* - uint8_t pk[]: byte array containing bit-packed pk -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - rho[i] = pk[i]; - } - pk += SEEDBYTES; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLT1_SIZE_PACKED); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_sk -* -* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0). 
-* -* Arguments: - uint8_t sk[]: output byte array -* - const uint8_t rho[]: byte array containing rho -* - const uint8_t key[]: byte array containing key -* - const uint8_t tr[]: byte array containing tr -* - const polyvecl *s1: pointer to vector s1 -* - const polyveck *s2: pointer to vector s2 -* - const polyveck *t0: pointer to vector t0 -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[CRHBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - sk[i] = rho[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - sk[i] = key[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < CRHBYTES; ++i) { - sk[i] = tr[i]; - } - sk += CRHBYTES; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s1->vec[i]); - } - sk += L * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(sk + i * POLETA_SIZE_PACKED, &s2->vec[i]); - } - sk += K * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(sk + i * POLT0_SIZE_PACKED, &t0->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_sk -* -* Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0). 
-* -* Arguments: - const uint8_t rho[]: output byte array for rho -* - const uint8_t key[]: output byte array for key -* - const uint8_t tr[]: output byte array for tr -* - const polyvecl *s1: pointer to output vector s1 -* - const polyveck *s2: pointer to output vector s2 -* - const polyveck *r0: pointer to output vector t0 -* - uint8_t sk[]: byte array containing bit-packed sk -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t sk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES]) { - for (size_t i = 0; i < SEEDBYTES; ++i) { - rho[i] = sk[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < SEEDBYTES; ++i) { - key[i] = sk[i]; - } - sk += SEEDBYTES; - - for (size_t i = 0; i < CRHBYTES; ++i) { - tr[i] = sk[i]; - } - sk += CRHBYTES; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLETA_SIZE_PACKED); - } - sk += L * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLETA_SIZE_PACKED); - } - sk += K * POLETA_SIZE_PACKED; - - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLT0_SIZE_PACKED); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_pack_sig -* -* Description: Bit-pack signature sig = (z, h, c). 
-* -* Arguments: - uint8_t sig[]: output byte array -* - const polyvecl *z: pointer to vector z -* - const polyveck *h: pointer to hint vector h -* - const poly *c: pointer to challenge polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, - const polyveck *h, - const poly *c) { - size_t k; - uint64_t signs, mask; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(sig + i * POLZ_SIZE_PACKED, &z->vec[i]); - } - sig += L * POLZ_SIZE_PACKED; - - /* Encode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - if (h->vec[i].coeffs[j] != 0) { - sig[k++] = (uint8_t)j; - } - } - - sig[OMEGA + i] = (uint8_t)k; - } - while (k < OMEGA) { - sig[k++] = 0; - } - sig += OMEGA + K; - - /* Encode c */ - signs = 0; - mask = 1; - for (size_t i = 0; i < N / 8; ++i) { - sig[i] = 0; - for (size_t j = 0; j < 8; ++j) { - if (c->coeffs[8 * i + j] != 0) { - sig[i] |= (uint8_t)(1u << j); - if (c->coeffs[8 * i + j] == (Q - 1)) { - signs |= mask; - } - mask <<= 1; - } - } - } - sig += N / 8; - for (size_t i = 0; i < 8; ++i) { - sig[i] = (uint8_t)(signs >> 8u * i); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_unpack_sig -* -* Description: Unpack signature sig = (z, h, c). -* -* Arguments: - polyvecl *z: pointer to output vector z -* - polyveck *h: pointer to output hint vector h -* - poly *c: pointer to output challenge polynomial -* - const uint8_t sig[]: byte array containing -* bit-packed signature -* -* Returns 1 in case of malformed signature; otherwise 0. 
-**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES]) { - size_t k; - uint64_t signs; - - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLZ_SIZE_PACKED); - } - sig += L * POLZ_SIZE_PACKED; - - /* Decode h */ - k = 0; - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < N; ++j) { - h->vec[i].coeffs[j] = 0; - } - - if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { - return 1; - } - - for (size_t j = k; j < sig[OMEGA + i]; ++j) { - /* Coefficients are ordered for strong unforgeability */ - if (j > k && sig[j] <= sig[j - 1]) { - return 1; - } - h->vec[i].coeffs[sig[j]] = 1; - } - - k = sig[OMEGA + i]; - } - - /* Extra indices are zero for strong unforgeability */ - for (size_t j = k; j < OMEGA; ++j) { - if (sig[j]) { - return 1; - } - } - - sig += OMEGA + K; - - /* Decode c */ - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)sig[N / 8 + i] << 8 * i; - } - - /* Extra sign bits are zero for strong unforgeability */ - if (signs >> 60) { - return 1; - } - - for (size_t i = 0; i < N / 8; ++i) { - for (size_t j = 0; j < 8; ++j) { - if ((sig[i] >> j) & 0x01) { - c->coeffs[8 * i + j] = 1; - c->coeffs[8 * i + j] ^= -((int32_t) signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - } - } - - return 0; -} diff --git a/crypto_sign/dilithium4/clean/packing.h b/crypto_sign/dilithium4/clean/packing.h deleted file mode 100644 index bc9929e4..00000000 --- a/crypto_sign/dilithium4/clean/packing.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_PACKING_H -#define PQCLEAN_DILITHIUM4_CLEAN_PACKING_H - -#include "api.h" -#include "params.h" -#include "polyvec.h" - -void PQCLEAN_DILITHIUM4_CLEAN_pack_pk( - uint8_t pk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES], - const uint8_t rho[SEEDBYTES], - const 
polyveck *t1); -void PQCLEAN_DILITHIUM4_CLEAN_pack_sk( - uint8_t sk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES], - const uint8_t rho[SEEDBYTES], - const uint8_t key[SEEDBYTES], - const uint8_t tr[SEEDBYTES], - const polyvecl *s1, - const polyveck *s2, - const polyveck *t0); -void PQCLEAN_DILITHIUM4_CLEAN_pack_sig( - uint8_t sig[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES], - const polyvecl *z, const polyveck *h, const poly *c); - -void PQCLEAN_DILITHIUM4_CLEAN_unpack_pk( - uint8_t rho[SEEDBYTES], - polyveck *t1, - const uint8_t pk[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES]); -void PQCLEAN_DILITHIUM4_CLEAN_unpack_sk( - uint8_t rho[SEEDBYTES], - uint8_t key[SEEDBYTES], - uint8_t tr[CRHBYTES], - polyvecl *s1, - polyveck *s2, - polyveck *t0, - const uint8_t *sk); -int PQCLEAN_DILITHIUM4_CLEAN_unpack_sig( - polyvecl *z, - polyveck *h, - poly *c, - const uint8_t sig[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES]); - -#endif diff --git a/crypto_sign/dilithium4/clean/params.h b/crypto_sign/dilithium4/clean/params.h deleted file mode 100644 index ce21782d..00000000 --- a/crypto_sign/dilithium4/clean/params.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_PARAMS_H -#define PQCLEAN_DILITHIUM4_CLEAN_PARAMS_H - - -#define SEEDBYTES 32 -#define CRHBYTES 48 -#define N 256 -#define Q 8380417 -#define QBITS 23 -#define D 14 -#define GAMMA1 ((Q - 1)/16) -#define GAMMA2 (GAMMA1/2) -#define ALPHA (2*GAMMA2) - -#define K 6 -#define L 5 -#define ETA 3 -#define SETABITS 3 -#define BETA 175 -#define OMEGA 120 - - -#define POLT1_SIZE_PACKED ((N*(QBITS - D))/8) -#define POLT0_SIZE_PACKED ((N*D)/8) -#define POLETA_SIZE_PACKED ((N*SETABITS)/8) -#define POLZ_SIZE_PACKED ((N*(QBITS - 3))/8) -#define POLW1_SIZE_PACKED ((N*4)/8) - -#endif diff --git a/crypto_sign/dilithium4/clean/poly.c b/crypto_sign/dilithium4/clean/poly.c deleted file mode 100644 index 993fe3ec..00000000 --- a/crypto_sign/dilithium4/clean/poly.c +++ /dev/null @@ -1,726 +0,0 @@ -#include "ntt.h" -#include 
"params.h" -#include "poly.h" -#include "reduce.h" -#include "rounding.h" -#include "symmetric.h" - - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_reduce -* -* Description: Reduce all coefficients of input polynomial to representative -* in [0,2*Q[. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_reduce32(a->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_csubq -* -* Description: For all coefficients of input polynomial subtract Q if -* coefficient is bigger than Q. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_csubq(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_csubq(a->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_freeze -* -* Description: Reduce all coefficients of the polynomial to standard -* representatives. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_freeze(a->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_add -* -* Description: Add polynomials. No modular reduction is performed. 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first summand -* - const poly *b: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_sub -* -* Description: Subtract polynomials. Assumes coefficients of second input -* polynomial to be less than 2*Q. No modular reduction is -* performed. -* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first input polynomial -* - const poly *b: pointer to second input polynomial to be -* subtracted from first input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i]; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_shiftl -* -* Description: Multiply polynomial by 2^D without modular reduction. Assumes -* input coefficients to be less than 2^{32-D}. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_shiftl(poly *a) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] <<= D; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_ntt -* -* Description: Forward NTT. Output coefficients can be up to 16*Q larger than -* input coefficients. 
-* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(poly *a) { - PQCLEAN_DILITHIUM4_CLEAN_ntt(a->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery -* -* Description: Inverse NTT and multiplication with 2^{32}. Input coefficients -* need to be less than 2*Q. Output coefficients are less than 2*Q. -* -* Arguments: - poly *a: pointer to input/output polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(poly *a) { - PQCLEAN_DILITHIUM4_CLEAN_invntt_frominvmont(a->coeffs); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery -* -* Description: Pointwise multiplication of polynomials in NTT domain -* representation and multiplication of resulting polynomial -* with 2^{-32}. Output coefficients are less than 2*Q if input -* coefficient are less than 22*Q. -* -* Arguments: - poly *c: pointer to output polynomial -* - const poly *a: pointer to first input polynomial -* - const poly *b: pointer to second input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) { - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]); - } - -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_power2round -* -* Description: For all coefficients c of the input polynomial, -* compute c0, c1 such that c mod Q = c1*2^D + c0 -* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. 
-* -* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *v: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_power2round(a->coeffs[i], &a0->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_decompose -* -* Description: For all coefficients c of the input polynomial, -* compute high and low bits c0, c1 such c mod Q = c1*ALPHA + c0 -* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we -* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. -* Assumes coefficients to be standard representatives. -* -* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 -* - poly *a0: pointer to output polynomial with coefficients Q + a0 -* - const poly *c: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { - for (size_t i = 0; i < N; ++i) { - a1->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_decompose(a->coeffs[i], &a0->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint -* -* Description: Compute hint polynomial. The coefficients of which indicate -* whether the low bits of the corresponding coefficient of -* the input polynomial overflow into the high bits. -* -* Arguments: - poly *h: pointer to output hint polynomial -* - const poly *a0: pointer to low part of input polynomial -* - const poly *a1: pointer to high part of input polynomial -* -* Returns number of 1 bits. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { - uint32_t s = 0; - for (size_t i = 0; i < N; ++i) { - h->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); - s += h->coeffs[i]; - } - return s; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint -* -* Description: Use hint polynomial to correct the high bits of a polynomial. -* -* Arguments: - poly *a: pointer to output polynomial with corrected high bits -* - const poly *b: pointer to input polynomial -* - const poly *h: pointer to input hint polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint(poly *a, const poly *b, const poly *h) { - for (size_t i = 0; i < N; ++i) { - a->coeffs[i] = PQCLEAN_DILITHIUM4_CLEAN_use_hint(b->coeffs[i], h->coeffs[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm -* -* Description: Check infinity norm of polynomial against given bound. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const poly *a: pointer to polynomial -* - uint32_t B: norm bound -* -* Returns 0 if norm is strictly smaller than B and 1 otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(const poly *a, uint32_t B) { - int32_t t; - /* It is ok to leak which coefficient violates the bound since - the probability for each coefficient is independent of secret - data but we must not leak the sign of the centralized representative. 
*/ - for (size_t i = 0; i < N; ++i) { - /* Absolute value of centralized representative */ - t = (int32_t)((Q - 1) / 2 - a->coeffs[i]); - t ^= (t >> 31); - t = (Q - 1) / 2 - t; - - if ((uint32_t)t >= B) { - return 1; - } - } - return 0; -} - -/************************************************* -* Name: rej_uniform -* -* Description: Sample uniformly random coefficients in [0, Q-1] by -* performing rejection sampling using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_uniform( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t; - - ctr = pos = 0; - while (ctr < len && pos + 3 <= buflen) { - t = buf[pos++]; - t |= (uint32_t)buf[pos++] << 8; - t |= (uint32_t)buf[pos++] << 16; - t &= 0x7FFFFF; - - if (t < Q) { - a[ctr++] = t; - } - } - - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_uniform -* -* Description: Sample polynomial with uniformly random coefficients -* in [0,Q-1] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES -* - uint16_t nonce: 2-byte nonce -**************************************************/ -#define POLY_UNIFORM_NBLOCKS ((769 + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_BUFLEN (POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(poly *a, - const uint8_t seed[SEEDBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_BUFLEN; - uint8_t buf[POLY_UNIFORM_BUFLEN + 2]; - stream128_state state; - - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); - - ctr = rej_uniform(a->coeffs, N, buf, buflen); - - while (ctr < N) { - off = buflen % 3; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM128_BLOCKBYTES + off; - stream128_squeezeblocks(buf + off, 1, &state); - ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream128_ctx_release(&state); -} - -/************************************************* -* Name: rej_eta -* -* Description: Sample uniformly random coefficients in [-ETA, ETA] by -* performing rejection sampling using array of random bytes. -* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. 
-**************************************************/ -static size_t rej_eta( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos < buflen) { - t0 = buf[pos] & 0x07; - t1 = buf[pos++] >> 5; - - if (t0 <= 2 * ETA) { - a[ctr++] = Q + ETA - t0; - } - if (t1 <= 2 * ETA && ctr < len) { - a[ctr++] = Q + ETA - t1; - } - } - - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta -* -* Description: Sample polynomial with uniformly random coefficients -* in [-ETA,ETA] by performing rejection sampling using the -* output stream from SHAKE256(seed|nonce). -* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* SEEDBYTES -* - uint16_t nonce: 2-byte nonce -**************************************************/ -#define POLY_UNIFORM_ETA_NBLOCKS (((N / 2 * (1u << SETABITS)) / (2 * ETA + 1) + STREAM128_BLOCKBYTES) / STREAM128_BLOCKBYTES) -#define POLY_UNIFORM_ETA_BUFLEN (POLY_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(poly *a, - const uint8_t *seed, - uint16_t nonce) { - size_t ctr; - uint8_t buf[POLY_UNIFORM_ETA_BUFLEN]; - stream128_state state; - - stream128_init(&state, seed, nonce); - stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); - - ctr = rej_eta(a->coeffs, N, buf, POLY_UNIFORM_ETA_BUFLEN); - - while (ctr < N) { - stream128_squeezeblocks(buf, 1, &state); - ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); - } - stream128_ctx_release(&state); -} - -/************************************************* -* Name: rej_gamma1m1 -* -* Description: Sample uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection sampling -* using array of random bytes. 
-* -* Arguments: - uint32_t *a: pointer to output array (allocated) -* - size_t len: number of coefficients to be sampled -* - const uint8_t *buf: array of random bytes -* - size_t buflen: length of array of random bytes -* -* Returns number of sampled coefficients. Can be smaller than len if not enough -* random bytes were given. -**************************************************/ -static size_t rej_gamma1m1( - uint32_t *a, - size_t len, - const uint8_t *buf, - size_t buflen) { - - size_t ctr, pos; - uint32_t t0, t1; - - ctr = pos = 0; - while (ctr < len && pos + 5 <= buflen) { - t0 = buf[pos]; - t0 |= (uint32_t)buf[pos + 1] << 8; - t0 |= (uint32_t)buf[pos + 2] << 16; - t0 &= 0xFFFFF; - - t1 = buf[pos + 2] >> 4; - t1 |= (uint32_t)buf[pos + 3] << 4; - t1 |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (t0 <= 2 * GAMMA1 - 2) { - a[ctr++] = Q + GAMMA1 - 1 - t0; - } - if (t1 <= 2 * GAMMA1 - 2 && ctr < len) { - a[ctr++] = Q + GAMMA1 - 1 - t1; - } - } - - return ctr; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1 -* -* Description: Sample polynomial with uniformly random coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1] by performing rejection -* sampling on output stream of SHAKE256(seed|nonce). 
-* -* Arguments: - poly *a: pointer to output polynomial -* - const uint8_t seed[]: byte array with seed of length -* CRHBYTES -* - uint16_t nonce: 16-bit nonce -**************************************************/ -#define POLY_UNIFORM_GAMMA1M1_NBLOCKS ((641 + STREAM256_BLOCKBYTES) / STREAM256_BLOCKBYTES) -#define POLY_UNIFORM_GAMMA1M1_BUFLEN (POLY_UNIFORM_GAMMA1M1_NBLOCKS * STREAM256_BLOCKBYTES) -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1(poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce) { - size_t ctr, off; - size_t buflen = POLY_UNIFORM_GAMMA1M1_BUFLEN; - uint8_t buf[POLY_UNIFORM_GAMMA1M1_BUFLEN + 4]; - stream256_state state; - - stream256_init(&state, seed, nonce); - stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1M1_NBLOCKS, &state); - - ctr = rej_gamma1m1(a->coeffs, N, buf, buflen); - - while (ctr < N) { - off = buflen % 5; - for (size_t i = 0; i < off; ++i) { - buf[i] = buf[buflen - off + i]; - } - - buflen = STREAM256_BLOCKBYTES + off; - stream256_squeezeblocks(buf + off, 1, &state); - ctr += rej_gamma1m1(a->coeffs + ctr, N - ctr, buf, buflen); - } - stream256_ctx_release(&state); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack -* -* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. -* Input coefficients are assumed to lie in [Q-ETA,Q+ETA]. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLETA_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { - uint8_t t[8]; - - for (size_t i = 0; i < N / 8; ++i) { - t[0] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 0]); - t[1] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 1]); - t[2] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 2]); - t[3] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 3]); - t[4] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 4]); - t[5] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 5]); - t[6] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 6]); - t[7] = (uint8_t)(Q + ETA - a->coeffs[8 * i + 7]); - - r[3 * i + 0] = (uint8_t)((t[0] >> 0) | (t[1] << 3) | (t[2] << 6)); - r[3 * i + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[3 * i + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack -* -* Description: Unpack polynomial with coefficients in [-ETA,ETA]. -* Output coefficients lie in [Q-ETA,Q+ETA]. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = a[3 * i + 0] & 0x07; - r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 0x07; - r->coeffs[8 * i + 2] = (uint32_t)((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 0x07; - r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 0x07; - r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 0x07; - r->coeffs[8 * i + 5] = (uint32_t)((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 0x07; - r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 0x07; - r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 0x07; - - r->coeffs[8 * i + 0] = Q + ETA - r->coeffs[8 * i + 0]; - r->coeffs[8 * i + 1] = Q + ETA - r->coeffs[8 * i + 1]; - r->coeffs[8 * i + 2] = Q + ETA - r->coeffs[8 * i + 2]; - r->coeffs[8 * i + 3] = Q + ETA - r->coeffs[8 * i + 3]; - r->coeffs[8 * i + 4] = Q + ETA - r->coeffs[8 * i + 4]; - r->coeffs[8 * i + 5] = Q + ETA - r->coeffs[8 * i + 5]; - r->coeffs[8 * i + 6] = Q + ETA - r->coeffs[8 * i + 6]; - r->coeffs[8 * i + 7] = Q + ETA - r->coeffs[8 * i + 7]; - } - -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack -* -* Description: Bit-pack polynomial t1 with coefficients fitting in 9 bits. -* Input coefficients are assumed to be standard representatives. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT1_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { - - for (size_t i = 0; i < N / 8; ++i) { - r[9 * i + 0] = (uint8_t)((a->coeffs[8 * i + 0] >> 0)); - r[9 * i + 1] = (uint8_t)((a->coeffs[8 * i + 0] >> 8) | (a->coeffs[8 * i + 1] << 1)); - r[9 * i + 2] = (uint8_t)((a->coeffs[8 * i + 1] >> 7) | (a->coeffs[8 * i + 2] << 2)); - r[9 * i + 3] = (uint8_t)((a->coeffs[8 * i + 2] >> 6) | (a->coeffs[8 * i + 3] << 3)); - r[9 * i + 4] = (uint8_t)((a->coeffs[8 * i + 3] >> 5) | (a->coeffs[8 * i + 4] << 4)); - r[9 * i + 5] = (uint8_t)((a->coeffs[8 * i + 4] >> 4) | (a->coeffs[8 * i + 5] << 5)); - r[9 * i + 6] = (uint8_t)((a->coeffs[8 * i + 5] >> 3) | (a->coeffs[8 * i + 6] << 6)); - r[9 * i + 7] = (uint8_t)((a->coeffs[8 * i + 6] >> 2) | (a->coeffs[8 * i + 7] << 7)); - r[9 * i + 8] = (uint8_t)((a->coeffs[8 * i + 7] >> 1)); - } - -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack -* -* Description: Unpack polynomial t1 with 9-bit coefficients. -* Output coefficients are standard representatives. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 8; ++i) { - r->coeffs[8 * i + 0] = ((a[9 * i + 0] ) | ((uint32_t) a[9 * i + 1] << 8)) & 0x1FF; - r->coeffs[8 * i + 1] = ((a[9 * i + 1] >> 1) | ((uint32_t) a[9 * i + 2] << 7)) & 0x1FF; - r->coeffs[8 * i + 2] = ((a[9 * i + 2] >> 2) | ((uint32_t) a[9 * i + 3] << 6)) & 0x1FF; - r->coeffs[8 * i + 3] = ((a[9 * i + 3] >> 3) | ((uint32_t) a[9 * i + 4] << 5)) & 0x1FF; - r->coeffs[8 * i + 4] = ((a[9 * i + 4] >> 4) | ((uint32_t) a[9 * i + 5] << 4)) & 0x1FF; - r->coeffs[8 * i + 5] = ((a[9 * i + 5] >> 5) | ((uint32_t) a[9 * i + 6] << 3)) & 0x1FF; - r->coeffs[8 * i + 6] = ((a[9 * i + 6] >> 6) | ((uint32_t) a[9 * i + 7] << 2)) & 0x1FF; - r->coeffs[8 * i + 7] = ((a[9 * i + 7] >> 7) | ((uint32_t) a[9 * i + 8] << 1)) & 0x1FF; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack -* -* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}]. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLT0_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { - uint32_t t[4]; - - for (size_t i = 0; i < N / 4; ++i) { - t[0] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 0]; - t[1] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 1]; - t[2] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 2]; - t[3] = Q + (1u << (D - 1)) - a->coeffs[4 * i + 3]; - - r[7 * i + 0] = (uint8_t)(t[0]); - r[7 * i + 1] = (uint8_t)(t[0] >> 8); - r[7 * i + 1] |= (uint8_t)(t[1] << 6); - r[7 * i + 2] = (uint8_t)(t[1] >> 2); - r[7 * i + 3] = (uint8_t)(t[1] >> 10); - r[7 * i + 3] |= (uint8_t)(t[2] << 4); - r[7 * i + 4] = (uint8_t)(t[2] >> 4); - r[7 * i + 5] = (uint8_t)(t[2] >> 12); - r[7 * i + 5] |= (uint8_t)(t[3] << 2); - r[7 * i + 6] = (uint8_t)(t[3] >> 6); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack -* -* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. -* Output coefficients lie in ]Q-2^{D-1},Q+2^{D-1}]. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { - - for (size_t i = 0; i < N / 4; ++i) { - r->coeffs[4 * i + 0] = a[7 * i + 0]; - r->coeffs[4 * i + 0] |= (uint32_t) (a[7 * i + 1] & 0x3F) << 8; - - r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6; - r->coeffs[4 * i + 1] |= (uint32_t) a[7 * i + 2] << 2; - r->coeffs[4 * i + 1] |= (uint32_t) (a[7 * i + 3] & 0x0F) << 10; - - r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4; - r->coeffs[4 * i + 2] |= (uint32_t) a[7 * i + 4] << 4; - r->coeffs[4 * i + 2] |= (uint32_t) (a[7 * i + 5] & 0x03) << 12; - - r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2; - r->coeffs[4 * i + 3] |= (uint32_t) a[7 * i + 6] << 6; - - r->coeffs[4 * i + 0] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 0]; - r->coeffs[4 * i + 1] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 1]; - r->coeffs[4 * i + 2] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 2]; - r->coeffs[4 * i + 3] = Q + (1U << (D - 1)) - r->coeffs[4 * i + 3]; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyz_pack -* -* Description: Bit-pack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Input coefficients are assumed to be standard representatives. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLZ_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(uint8_t *r, const poly *a) { - uint32_t t[2]; - - for (size_t i = 0; i < N / 2; ++i) { - /* Map to {0,...,2*GAMMA1 - 2} */ - t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0]; - t[0] += ((int32_t)t[0] >> 31) & Q; - t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1]; - t[1] += ((int32_t)t[1] >> 31) & Q; - - r[5 * i + 0] = (uint8_t)t[0]; - r[5 * i + 1] = (uint8_t)(t[0] >> 8); - r[5 * i + 2] = (uint8_t)(t[0] >> 16); - r[5 * i + 2] |= (uint8_t)(t[1] << 4); - r[5 * i + 3] = (uint8_t)(t[1] >> 4); - r[5 * i + 4] = (uint8_t)(t[1] >> 12); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack -* -* Description: Unpack polynomial z with coefficients -* in [-(GAMMA1 - 1), GAMMA1 - 1]. -* Output coefficients are standard representatives. 
-* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: byte array with bit-packed polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { - for (size_t i = 0; i < N / 2; ++i) { - r->coeffs[2 * i + 0] = a[5 * i + 0]; - r->coeffs[2 * i + 0] |= (uint32_t) a[5 * i + 1] << 8; - r->coeffs[2 * i + 0] |= (uint32_t) (a[5 * i + 2] & 0x0F) << 16; - - r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 3] << 4; - r->coeffs[2 * i + 1] |= (uint32_t) a[5 * i + 4] << 12; - - r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0]; - r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q; - r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1]; - r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q; - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack -* -* Description: Bit-pack polynomial w1 with coefficients in [0, 15]. -* Input coefficients are assumed to be standard representatives. 
-* -* Arguments: - uint8_t *r: pointer to output byte array with at least -* POLW1_SIZE_PACKED bytes -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { - for (size_t i = 0; i < N / 2; ++i) { - r[i] = (uint8_t)(a->coeffs[2 * i + 0] | a->coeffs[2 * i + 1] << 4); - } - -} diff --git a/crypto_sign/dilithium4/clean/poly.h b/crypto_sign/dilithium4/clean/poly.h deleted file mode 100644 index c9f21776..00000000 --- a/crypto_sign/dilithium4/clean/poly.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_POLY_H -#define PQCLEAN_DILITHIUM4_CLEAN_POLY_H - -#include -#include - -#include "params.h" - -typedef struct { - uint32_t coeffs[N]; -} poly; - -void PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_poly_csubq(poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(poly *a); - -void PQCLEAN_DILITHIUM4_CLEAN_poly_add( - poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM4_CLEAN_poly_sub( - poly *c, const poly *a, const poly *b); -void PQCLEAN_DILITHIUM4_CLEAN_poly_shiftl(poly *a); - -void PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery( - poly *c, const poly *a, const poly *b); - -void PQCLEAN_DILITHIUM4_CLEAN_poly_power2round( - poly *a1, poly *a0, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_poly_decompose( - poly *a1, poly *a0, const poly *a); -uint32_t PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint( - poly *h, const poly *a0, const poly *a1); -void PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint( - poly *a, const poly *b, const poly *h); - -int PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm( - const poly *a, uint32_t B); -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform( - poly *a, - const uint8_t *seed, - uint16_t nonce); -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta( - poly *a, - const uint8_t *seed, - 
uint16_t nonce); -void PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1( - poly *a, - const uint8_t seed[CRHBYTES], - uint16_t nonce); - -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_CLEAN_polyz_pack(uint8_t *r, const poly *a); -void PQCLEAN_DILITHIUM4_CLEAN_polyz_unpack(poly *r, const uint8_t *a); - -void PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(uint8_t *r, const poly *a); - -#endif diff --git a/crypto_sign/dilithium4/clean/polyvec.c b/crypto_sign/dilithium4/clean/polyvec.c deleted file mode 100644 index eab70e23..00000000 --- a/crypto_sign/dilithium4/clean/polyvec.c +++ /dev/null @@ -1,336 +0,0 @@ -#include -#include - -#include "params.h" -#include "poly.h" -#include "polyvec.h" - -/**************************************************************/ -/************ Vectors of polynomials of length L **************/ -/**************************************************************/ - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze -* -* Description: Reduce coefficients of polynomials in vector of length L -* to standard representatives. -* -* Arguments: - polyvecl *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add -* -* Description: Add vectors of polynomials of length L. -* No modular reduction is performed. 
-* -* Arguments: - polyvecl *w: pointer to output vector -* - const polyvecl *u: pointer to first summand -* - const polyvecl *v: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add( - polyvecl *w, const polyvecl *u, const polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt -* -* Description: Forward NTT of all polynomials in vector of length L. Output -* coefficients can be up to 16*Q larger than input coefficients. -* -* Arguments: - polyvecl *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(polyvecl *v) { - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery -* -* Description: Pointwise multiply vectors of polynomials of length L, multiply -* resulting vector by 2^{-32} and add (accumulate) polynomials -* in it. Input/output vectors are in NTT domain representation. -* Input coefficients are assumed to be less than 22*Q. Output -* coeffcient are less than 2*L*Q. 
-* -* Arguments: - poly *w: output polynomial -* - const polyvecl *u: pointer to first input vector -* - const polyvecl *v: pointer to second input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v) { - poly t; - - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]); - - for (size_t i = 1; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(&t, &u->vec[i], &v->vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_add(w, w, &t); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm -* -* Description: Check infinity norm of polynomials in vector of length L. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const polyvecl *v: pointer to vector -* - uint32_t B: norm bound -* -* Returns 0 if norm of all polynomials is strictly smaller than B and 1 -* otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B) { - for (size_t i = 0; i < L; ++i) { - if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], B)) { - return 1; - } - } - - return 0; -} - -/**************************************************************/ -/************ Vectors of polynomials of length K **************/ -/**************************************************************/ - - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce -* -* Description: Reduce coefficients of polynomials in vector of length K -* to representatives in [0,2*Q[. 
-* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq -* -* Description: For all coefficients of polynomials in vector of length K -* subtract Q if coefficient is bigger than Q. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_csubq(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze -* -* Description: Reduce coefficients of polynomials in vector of length K -* to standard representatives. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_freeze(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_add -* -* Description: Add vectors of polynomials of length K. -* No modular reduction is performed. 
-* -* Arguments: - polyveck *w: pointer to output vector -* - const polyveck *u: pointer to first summand -* - const polyveck *v: pointer to second summand -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub -* -* Description: Subtract vectors of polynomials of length K. -* Assumes coefficients of polynomials in second input vector -* to be less than 2*Q. No modular reduction is performed. -* -* Arguments: - polyveck *w: pointer to output vector -* - const polyveck *u: pointer to first input vector -* - const polyveck *v: pointer to second input vector to be -* subtracted from first input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl -* -* Description: Multiply vector of polynomials of Length K by 2^D without modular -* reduction. Assumes input coefficients to be less than 2^{32-D}. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_shiftl(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt -* -* Description: Forward NTT of all polynomials in vector of length K. Output -* coefficients can be up to 16*Q larger than input coefficients. 
-* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery -* -* Description: Inverse NTT and multiplication by 2^{32} of polynomials -* in vector of length K. Input coefficients need to be less -* than 2*Q. -* -* Arguments: - polyveck *v: pointer to input/output vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery(polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm -* -* Description: Check infinity norm of polynomials in vector of length K. -* Assumes input coefficients to be standard representatives. -* -* Arguments: - const polyveck *v: pointer to vector -* - uint32_t B: norm bound -* -* Returns 0 if norm of all polynomials are strictly smaller than B and 1 -* otherwise. -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm(const polyveck *v, uint32_t B) { - for (size_t i = 0; i < K; ++i) { - if (PQCLEAN_DILITHIUM4_CLEAN_poly_chknorm(&v->vec[i], B)) { - return 1; - } - } - - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round -* -* Description: For all coefficients a of polynomials in vector of length K, -* compute a0, a1 such that a mod Q = a1*2^D + a0 -* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be -* standard representatives. 
-* -* Arguments: - polyveck *v1: pointer to output vector of polynomials with -* coefficients a1 -* - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 -* - const polyveck *v: pointer to input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose -* -* Description: For all coefficients a of polynomials in vector of length K, -* compute high and low bits a0, a1 such a mod Q = a1*ALPHA + a0 -* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we -* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. -* Assumes coefficients to be standard representatives. -* -* Arguments: - polyveck *v1: pointer to output vector of polynomials with -* coefficients a1 -* - polyveck *v0: pointer to output vector of polynomials with -* coefficients Q + a0 -* - const polyveck *v: pointer to input vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint -* -* Description: Compute hint vector. -* -* Arguments: - polyveck *h: pointer to output vector -* - const polyveck *v0: pointer to low part of input vector -* - const polyveck *v1: pointer to high part of input vector -* -* Returns number of 1 bits. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1) { - uint32_t s = 0; - - for (size_t i = 0; i < K; ++i) { - s += PQCLEAN_DILITHIUM4_CLEAN_poly_make_hint( - &h->vec[i], &v0->vec[i], &v1->vec[i]); - } - - return s; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint -* -* Description: Use hint vector to correct the high bits of input vector. -* -* Arguments: - polyveck *w: pointer to output vector of polynomials with -* corrected high bits -* - const polyveck *v: pointer to input vector -* - const polyveck *h: pointer to input hint vector -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h) { - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_use_hint( - &w->vec[i], &v->vec[i], &h->vec[i]); - } -} diff --git a/crypto_sign/dilithium4/clean/polyvec.h b/crypto_sign/dilithium4/clean/polyvec.h deleted file mode 100644 index 035b2085..00000000 --- a/crypto_sign/dilithium4/clean/polyvec.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_POLYVEC_H -#define PQCLEAN_DILITHIUM4_CLEAN_POLYVEC_H - -#include - -#include "params.h" -#include "poly.h" - -/* Vectors of polynomials of length L */ -typedef struct { - poly vec[L]; -} polyvecl; - -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze(polyvecl *v); - -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); - -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(polyvecl *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery( - poly *w, const polyvecl *u, const polyvecl *v); - -int PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(const polyvecl *v, uint32_t B); - - - -/* Vectors of polynomials of length K */ -typedef struct { - poly vec[K]; -} polyveck; - -void 
PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce(polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze(polyveck *v); - -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_add( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub( - polyveck *w, const polyveck *u, const polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl(polyveck *v); - -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery(polyveck *v); - -int PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm( - const polyveck *v, uint32_t B); - -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round( - polyveck *v1, polyveck *v0, const polyveck *v); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose( - polyveck *v1, polyveck *v0, const polyveck *v); -uint32_t PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint( - polyveck *h, - const polyveck *v0, - const polyveck *v1); -void PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint( - polyveck *w, const polyveck *v, const polyveck *h); - -#endif diff --git a/crypto_sign/dilithium4/clean/reduce.c b/crypto_sign/dilithium4/clean/reduce.c deleted file mode 100644 index 69334c2d..00000000 --- a/crypto_sign/dilithium4/clean/reduce.c +++ /dev/null @@ -1,75 +0,0 @@ -#include - -#include "params.h" -#include "reduce.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce -* -* Description: For finite field element a with 0 <= a <= Q*2^32, -* compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q. -* -* Arguments: - uint64_t: finite field element a -* -* Returns r. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce(uint64_t a) { - uint64_t t; - - t = a * QINV; - t &= (1ULL << 32) - 1; - t *= Q; - t = a + t; - t >>= 32; - return (uint32_t)t; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_reduce32 -* -* Description: For finite field element a, compute r \equiv a (mod Q) -* such that 0 <= r < 2*Q. -* -* Arguments: - uint32_t: finite field element a -* -* Returns r. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_reduce32(uint32_t a) { - uint32_t t; - - t = a & 0x7FFFFF; - a >>= 23; - t += (a << 13) - a; - return t; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_csubq -* -* Description: Subtract Q if input coefficient is bigger than Q. -* -* Arguments: - uint32_t: finite field element a -* -* Returns r. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_csubq(uint32_t a) { - a -= Q; - a += ((int32_t)a >> 31) & Q; - return a; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_freeze -* -* Description: For finite field element a, compute standard -* representative r = a mod Q. -* -* Arguments: - uint32_t: finite field element a -* -* Returns r. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_freeze(uint32_t a) { - a = PQCLEAN_DILITHIUM4_CLEAN_reduce32(a); - a = PQCLEAN_DILITHIUM4_CLEAN_csubq(a); - return a; -} diff --git a/crypto_sign/dilithium4/clean/reduce.h b/crypto_sign/dilithium4/clean/reduce.h deleted file mode 100644 index 9caf592d..00000000 --- a/crypto_sign/dilithium4/clean/reduce.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_REDUCE_H -#define PQCLEAN_DILITHIUM4_CLEAN_REDUCE_H - -#include - -#define MONT 4193792U // 2^32 % Q -#define QINV 4236238847U // -q^(-1) mod 2^32 - -/* a <= Q*2^32 => r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_montgomery_reduce(uint64_t a); - -/* r < 2*Q */ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_reduce32(uint32_t a); - -/* a < 2*Q => r < Q */ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_csubq(uint32_t a); - -/* r < Q */ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_freeze(uint32_t a); - -#endif diff --git a/crypto_sign/dilithium4/clean/rounding.c b/crypto_sign/dilithium4/clean/rounding.c deleted file mode 100644 index 27c35122..00000000 --- a/crypto_sign/dilithium4/clean/rounding.c +++ /dev/null @@ -1,117 +0,0 @@ -#include "params.h" -#include "rounding.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_power2round -* -* Description: For finite field element a, compute a0, a1 such that -* a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. -* Assumes a to be standard representative. -* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 -* -* Returns a1. 
-**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_power2round(uint32_t a, uint32_t *a0) { - uint32_t t; - - /* Centralized remainder mod 2^D */ - t = a & ((1U << D) - 1); - t -= (1U << (D - 1)) + 1; - t += ((uint32_t)((int32_t)t >> 31) & (1 << D)); - t -= (1U << (D - 1)) - 1; - *a0 = Q + t; - a = (a - t) >> D; - return a; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_decompose -* -* Description: For finite field element a, compute high and low bits a0, a1 such -* that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except -* if a1 = (Q-1)/ALPHA where we set a1 = 0 and -* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be standard -* representative. -* -* Arguments: - uint32_t a: input element -* - uint32_t *a0: pointer to output element Q + a0 -* -* Returns a1. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_decompose(uint32_t a, uint32_t *a0) { - int32_t t, u; - - /* Centralized remainder mod ALPHA */ - t = a & 0x7FFFFu; - t += (int32_t)((a >> 19u) << 9u); - t -= ALPHA / 2 + 1; - t += (t >> 31) & ALPHA; - t -= ALPHA / 2 - 1; - a -= (uint32_t)t; - - /* Divide by ALPHA (possible to avoid) */ - u = (int32_t)(a - 1); - u >>= 31; - a = (a >> 19) + 1; - a -= u & 1; - - /* Border case */ - *a0 = (uint32_t)(Q + t - (int32_t)(a >> 4u)); - a &= 0xFu; - return a; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_make_hint -* -* Description: Compute hint bit indicating whether the low bits of the -* input element overflow into the high bits. Inputs assumed to be -* standard representatives. -* -* Arguments: - uint32_t a0: low bits of input element -* - uint32_t a1: high bits of input element -* -* Returns 1 if high bits of a and b differ and 0 otherwise. 
-**************************************************/ -unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(uint32_t a0, uint32_t a1) { - if (a0 <= GAMMA2 || a0 > Q - GAMMA2 || (a0 == Q - GAMMA2 && a1 == 0)) { - return 0; - } - - return 1; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_use_hint -* -* Description: Correct high bits according to hint. -* -* Arguments: - uint32_t a: input element -* - unsigned int hint: hint bit -* -* Returns corrected high bits. -**************************************************/ -uint32_t PQCLEAN_DILITHIUM4_CLEAN_use_hint(uint32_t a, unsigned int hint) { - uint32_t a0, a1; - - a1 = PQCLEAN_DILITHIUM4_CLEAN_decompose(a, &a0); - if (hint == 0) { - return a1; - } - if (a0 > Q) { - return (a1 + 1) & 0xF; - } - - return (a1 - 1) & 0xF; - - /* If PQCLEAN_DILITHIUM4_CLEAN_decompose does not divide out ALPHA: - if(hint == 0) - return a1; - else if(a0 > Q) - return (a1 + ALPHA) % (Q - 1); - else - return (a1 - ALPHA) % (Q - 1); - */ -} diff --git a/crypto_sign/dilithium4/clean/rounding.h b/crypto_sign/dilithium4/clean/rounding.h deleted file mode 100644 index e0eed02f..00000000 --- a/crypto_sign/dilithium4/clean/rounding.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_ROUNDING_H -#define PQCLEAN_DILITHIUM4_CLEAN_ROUNDING_H - -#include - -uint32_t PQCLEAN_DILITHIUM4_CLEAN_power2round(uint32_t a, uint32_t *a0); -uint32_t PQCLEAN_DILITHIUM4_CLEAN_decompose(uint32_t a, uint32_t *a0); -unsigned int PQCLEAN_DILITHIUM4_CLEAN_make_hint(uint32_t a0, uint32_t a1); -uint32_t PQCLEAN_DILITHIUM4_CLEAN_use_hint(uint32_t a, unsigned int hint); - -#endif diff --git a/crypto_sign/dilithium4/clean/sign.c b/crypto_sign/dilithium4/clean/sign.c deleted file mode 100644 index aebe76be..00000000 --- a/crypto_sign/dilithium4/clean/sign.c +++ /dev/null @@ -1,427 +0,0 @@ -#include -#include - -#include "fips202.h" -#include "packing.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" -#include 
"randombytes.h" -#include "sign.h" -#include "symmetric.h" - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_expand_mat -* -* Description: Implementation of ExpandA. Generates matrix A with uniformly -* random coefficients a_{i,j} by performing rejection -* sampling on the output stream of SHAKE128(rho|i|j). -* -* Arguments: - polyvecl mat[K]: output matrix -* - const uint8_t rho[]: byte array containing seed rho -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { - for (size_t i = 0; i < K; ++i) { - for (size_t j = 0; j < L; ++j) { - PQCLEAN_DILITHIUM4_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t)((i << 8) + j)); - } - } -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_challenge -* -* Description: Implementation of H. Samples polynomial with 60 nonzero -* coefficients in {-1,1} using the output stream of -* SHAKE256(mu|w1). 
-* -* Arguments: - poly *c: pointer to output polynomial -* - const uint8_t mu[]: byte array containing mu -* - const polyveck *w1: pointer to vector w1 -**************************************************/ -void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, - const uint8_t mu[CRHBYTES], - const polyveck *w1) { - uint64_t signs; - uint8_t inbuf[CRHBYTES + K * POLW1_SIZE_PACKED]; - uint8_t outbuf[SHAKE256_RATE]; - shake256ctx state; - uint8_t b; - size_t pos; - - for (size_t i = 0; i < CRHBYTES; ++i) { - inbuf[i] = mu[i]; - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, &w1->vec[i]); - } - - shake256_absorb(&state, inbuf, sizeof(inbuf)); - shake256_squeezeblocks(outbuf, 1, &state); - - signs = 0; - for (size_t i = 0; i < 8; ++i) { - signs |= (uint64_t)outbuf[i] << 8 * i; - } - - pos = 8; - - for (size_t i = 0; i < N; ++i) { - c->coeffs[i] = 0; - } - - for (size_t i = 196; i < 256; ++i) { - do { - if (pos >= SHAKE256_RATE) { - shake256_squeezeblocks(outbuf, 1, &state); - pos = 0; - } - - b = outbuf[pos++]; - } while (b > i); - - c->coeffs[i] = c->coeffs[b]; - c->coeffs[b] = 1; - c->coeffs[b] ^= -((int32_t)signs & 1) & (1 ^ (Q - 1)); - signs >>= 1; - } - shake256_ctx_release(&state); -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair -* -* Description: Generates public and private key. 
-* -* Arguments: - uint8_t *pk: pointer to output public key (allocated -* array of PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (allocated -* array of PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_SECRETKEYBYTES bytes) -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { - uint8_t seedbuf[3 * SEEDBYTES]; - uint8_t tr[CRHBYTES]; - const uint8_t *rho, *rhoprime, *key; - uint16_t nonce = 0; - polyvecl mat[K]; - polyvecl s1, s1hat; - polyveck s2, t, t1, t0; - - /* Expand 32 bytes of randomness into rho, rhoprime and key */ - randombytes(seedbuf, 3 * SEEDBYTES); - rho = seedbuf; - rhoprime = seedbuf + SEEDBYTES; - key = seedbuf + 2 * SEEDBYTES; - - /* Expand matrix */ - PQCLEAN_DILITHIUM4_CLEAN_expand_mat(mat, rho); - - /* Sample short vectors s1 and s2 */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(&s1.vec[i], rhoprime, nonce++); - } - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_eta(&s2.vec[i], rhoprime, nonce++); - } - - /* Matrix-vector multiplication */ - s1hat = s1; - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(&s1hat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery(&t.vec[i], &mat[i], &s1hat); - PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(&t.vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&t.vec[i]); - } - - /* Add error vector s2 */ - PQCLEAN_DILITHIUM4_CLEAN_polyveck_add(&t, &t, &s2); - - /* Extract t1 and write public key */ - PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze(&t); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_power2round(&t1, &t0, &t); - PQCLEAN_DILITHIUM4_CLEAN_pack_pk(pk, rho, &t1); - - /* Compute CRH(rho, t1) and write secret key */ - crh(tr, pk, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES); - PQCLEAN_DILITHIUM4_CLEAN_pack_sk(sk, rho, key, tr, &s1, &s2, &t0); - - return 0; -} - 
-/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature -* -* Description: Compute signed message. -* -* Arguments: - uint8_t *sig: pointer to output signature (PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES -* of len) -* - size_t *smlen: pointer to output length of signed message -* (should be PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - uint8_t *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature( - uint8_t *sig, size_t *siglen, - const uint8_t *msg, size_t mlen, - const uint8_t *sk) { - uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; - uint8_t *rho, *tr, *key, *mu, *rhoprime; - uint32_t n; - uint16_t nonce = 0; - poly c, chat; - polyvecl mat[K], s1, y, yhat, z; - polyveck t0, s2, w, w1, w0; - polyveck h, cs2, ct0; - - rho = seedbuf; - tr = rho + SEEDBYTES; - key = tr + CRHBYTES; - mu = key + SEEDBYTES; - rhoprime = mu + CRHBYTES; - PQCLEAN_DILITHIUM4_CLEAN_unpack_sk(rho, key, tr, &s1, &s2, &t0, sk); - - // use incremental hash API instead of copying around buffers - /* Compute CRH(tr, msg) */ - shake256incctx state; - shake256_inc_init(&state); - shake256_inc_absorb(&state, tr, CRHBYTES); - shake256_inc_absorb(&state, msg, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); - shake256_inc_ctx_release(&state); - - crh(rhoprime, key, SEEDBYTES + CRHBYTES); - - /* Expand matrix and transform vectors */ - PQCLEAN_DILITHIUM4_CLEAN_expand_mat(mat, rho); - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(&s1); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(&s2); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(&t0); - -rej: - /* Sample intermediate vector y */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_uniform_gamma1m1(&y.vec[i], rhoprime, nonce++); - } - - /* Matrix-vector multiplication */ - yhat = 
y; - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(&yhat); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery(&w.vec[i], &mat[i], &yhat); - PQCLEAN_DILITHIUM4_CLEAN_poly_reduce(&w.vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&w.vec[i]); - } - - /* Decompose w and call the random oracle */ - PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(&w); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_decompose(&w1, &w0, &w); - PQCLEAN_DILITHIUM4_CLEAN_challenge(&c, mu, &w1); - chat = c; - PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(&chat); - - /* Check that subtracting cs2 does not change high bits of w and low bits - * do not reveal secret information */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(&cs2.vec[i], &chat, &s2.vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&cs2.vec[i]); - } - PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub(&w0, &w0, &cs2); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_freeze(&w0); - if (PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { - goto rej; - } - - /* Compute z, reject if it reveals secret */ - for (size_t i = 0; i < L; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(&z.vec[i], &chat, &s1.vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&z.vec[i]); - } - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_add(&z, &z, &y); - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_freeze(&z); - if (PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - goto rej; - } - - /* Compute hints for w1 */ - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(&ct0.vec[i], &chat, &t0.vec[i]); - PQCLEAN_DILITHIUM4_CLEAN_poly_invntt_montgomery(&ct0.vec[i]); - } - - PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(&ct0); - if (PQCLEAN_DILITHIUM4_CLEAN_polyveck_chknorm(&ct0, GAMMA2)) { - goto rej; - } - - PQCLEAN_DILITHIUM4_CLEAN_polyveck_add(&w0, &w0, &ct0); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(&w0); - n = 
PQCLEAN_DILITHIUM4_CLEAN_polyveck_make_hint(&h, &w0, &w1); - if (n > OMEGA) { - goto rej; - } - - /* Write signature */ - PQCLEAN_DILITHIUM4_CLEAN_pack_sig(sig, &z, &h, &c); - *siglen = PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES; - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign -* -* Description: Compute signed message. -* -* Arguments: - uint8_t *sm: pointer to output signed message (allocated -* array with PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES + mlen bytes), -* can be equal to m -* - size_t *smlen: pointer to output length of signed -* message -* - const uint8_t *m: pointer to message to be signed -* - size_t mlen: length of message -* - const uint8_t *sk: pointer to bit-packed secret key -* -* Returns 0 (success) -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign( - uint8_t *sm, size_t *smlen, - const uint8_t *m, size_t mlen, - const uint8_t *sk) { - int rc; - memmove(sm + PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, m, mlen); - rc = PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_signature(sm, smlen, m, mlen, sk); - *smlen += mlen; - return rc; -} - - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify -* -* Description: Verify signed message. 
-* -* Arguments: - uint8_t *sig: signature -* - size_t siglen: length of signature (PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) -* - uint8_t *m: pointer to message -* - size_t *mlen: pointer to output length of message -* - uint8_t *pk: pointer to bit-packed public key -* -* Returns 0 if signed message could be verified correctly and -1 otherwise -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify( - const uint8_t *sig, size_t siglen, - const uint8_t *m, size_t mlen, const uint8_t *pk) { - uint8_t rho[SEEDBYTES]; - uint8_t mu[CRHBYTES]; - poly c, chat, cp; - polyvecl mat[K], z; - polyveck t1, w1, h, tmp1, tmp2; - - if (siglen < PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) { - return -1; - } - - PQCLEAN_DILITHIUM4_CLEAN_unpack_pk(rho, &t1, pk); - if (PQCLEAN_DILITHIUM4_CLEAN_unpack_sig(&z, &h, &c, sig)) { - return -1; - } - if (PQCLEAN_DILITHIUM4_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { - return -1; - } - - /* Compute CRH(CRH(rho, t1), msg) */ - crh(mu, pk, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_PUBLICKEYBYTES); - - shake256incctx state; - shake256_inc_init(&state); - shake256_inc_absorb(&state, mu, CRHBYTES); - shake256_inc_absorb(&state, m, mlen); - shake256_inc_finalize(&state); - shake256_inc_squeeze(mu, CRHBYTES, &state); - shake256_inc_ctx_release(&state); - - /* Matrix-vector multiplication; compute Az - c2^dt1 */ - PQCLEAN_DILITHIUM4_CLEAN_expand_mat(mat, rho); - - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_ntt(&z); - for (size_t i = 0; i < K ; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_polyvecl_pointwise_acc_invmontgomery(&tmp1.vec[i], &mat[i], &z); - } - - chat = c; - PQCLEAN_DILITHIUM4_CLEAN_poly_ntt(&chat); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_shiftl(&t1); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_ntt(&t1); - for (size_t i = 0; i < K; ++i) { - PQCLEAN_DILITHIUM4_CLEAN_poly_pointwise_invmontgomery(&tmp2.vec[i], &chat, &t1.vec[i]); - } - - PQCLEAN_DILITHIUM4_CLEAN_polyveck_sub(&tmp1, &tmp1, &tmp2); - 
PQCLEAN_DILITHIUM4_CLEAN_polyveck_reduce(&tmp1); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_invntt_montgomery(&tmp1); - - /* Reconstruct w1 */ - PQCLEAN_DILITHIUM4_CLEAN_polyveck_csubq(&tmp1); - PQCLEAN_DILITHIUM4_CLEAN_polyveck_use_hint(&w1, &tmp1, &h); - - /* Call random oracle and verify challenge */ - PQCLEAN_DILITHIUM4_CLEAN_challenge(&cp, mu, &w1); - for (size_t i = 0; i < N; ++i) { - if (c.coeffs[i] != cp.coeffs[i]) { - return -1; - } - } - - // All good - return 0; -} - -/************************************************* -* Name: PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open -* -* Description: Verify signed message. -* -* Arguments: - uint8_t *m: pointer to output message (allocated -* array with smlen bytes), can be equal to sm -* - size_t *mlen: pointer to output length of message -* - const uint8_t *sm: pointer to signed message -* - size_t smlen: length of signed message -* - const uint8_t *pk: pointer to bit-packed public key -* -* Returns 0 if signed message could be verified correctly and -1 otherwise -**************************************************/ -int PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_open( - uint8_t *m, size_t *mlen, - const uint8_t *sm, size_t smlen, - const uint8_t *pk) { - if (smlen < PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES) { - goto badsig; - } - *mlen = smlen - PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES; - - if (PQCLEAN_DILITHIUM4_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, - sm + PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES, *mlen, pk)) { - goto badsig; - } else { - /* All good, copy msg, return 0 */ - for (size_t i = 0; i < *mlen; ++i) { - m[i] = sm[PQCLEAN_DILITHIUM4_CLEAN_CRYPTO_BYTES + i]; - } - return 0; - } - - /* Signature verification failed */ -badsig: - *mlen = (size_t) -1; - for (size_t i = 0; i < smlen; ++i) { - m[i] = 0; - } - - return -1; -} - diff --git a/crypto_sign/dilithium4/clean/sign.h b/crypto_sign/dilithium4/clean/sign.h deleted file mode 100644 index f44cb5fd..00000000 --- a/crypto_sign/dilithium4/clean/sign.h 
+++ /dev/null @@ -1,12 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_SIGN_H -#define PQCLEAN_DILITHIUM4_CLEAN_SIGN_H - -#include "api.h" -#include "params.h" -#include "poly.h" -#include "polyvec.h" - -void PQCLEAN_DILITHIUM4_CLEAN_expand_mat(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); -void PQCLEAN_DILITHIUM4_CLEAN_challenge(poly *c, const uint8_t mu[CRHBYTES], - const polyveck *w1); -#endif diff --git a/crypto_sign/dilithium4/clean/stream.c b/crypto_sign/dilithium4/clean/stream.c deleted file mode 100644 index 9be23a56..00000000 --- a/crypto_sign/dilithium4/clean/stream.c +++ /dev/null @@ -1,26 +0,0 @@ -#include "stream.h" - -#include - -void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { - - uint8_t buf[SEEDBYTES + 2]; - memcpy(buf, seed, SEEDBYTES); - buf[SEEDBYTES] = (uint8_t)nonce; - buf[SEEDBYTES + 1] = (uint8_t)(nonce >> 8); - - shake128_absorb(state, buf, SEEDBYTES + 2); -} - - -void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { - - uint8_t buf[CRHBYTES + 2]; - memcpy(buf, seed, CRHBYTES); - buf[CRHBYTES] = (uint8_t)nonce; - buf[CRHBYTES + 1] = (uint8_t)(nonce >> 8); - - shake256_absorb(state, buf, CRHBYTES + 2); -} diff --git a/crypto_sign/dilithium4/clean/stream.h b/crypto_sign/dilithium4/clean/stream.h deleted file mode 100644 index d9807822..00000000 --- a/crypto_sign/dilithium4/clean/stream.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_STREAM_H -#define PQCLEAN_DILITHIUM4_CLEAN_STREAM_H - -#include - -#include "fips202.h" -#include "params.h" - -void PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init( - shake128ctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce); - -void PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init( - shake256ctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce); - -#endif diff --git a/crypto_sign/dilithium4/clean/symmetric.h b/crypto_sign/dilithium4/clean/symmetric.h 
deleted file mode 100644 index 117f5640..00000000 --- a/crypto_sign/dilithium4/clean/symmetric.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef PQCLEAN_DILITHIUM4_CLEAN_SYMMETRIC_H -#define PQCLEAN_DILITHIUM4_CLEAN_SYMMETRIC_H - -#include "params.h" -#include "stream.h" - - -#include "fips202.h" - -#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) -#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_CLEAN_shake128_stream_init(STATE, SEED, NONCE) -#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream128_ctx_release(STATE) shake128_ctx_release(STATE) -#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM4_CLEAN_shake256_stream_init(STATE, SEED, NONCE) -#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) shake256_squeezeblocks(OUT, OUTBLOCKS, STATE) -#define stream256_ctx_release(STATE) shake256_ctx_release(STATE) - -#define STREAM128_BLOCKBYTES SHAKE128_RATE -#define STREAM256_BLOCKBYTES SHAKE256_RATE - -typedef shake128ctx stream128_state; -typedef shake256ctx stream256_state; - - -#endif diff --git a/crypto_sign/dilithium5/META.yml b/crypto_sign/dilithium5/META.yml new file mode 100644 index 00000000..e498d4bc --- /dev/null +++ b/crypto_sign/dilithium5/META.yml @@ -0,0 +1,31 @@ +name: Dilithium5 +type: signature +claimed-nist-level: 5 +length-public-key: 2592 +length-secret-key: 4880 +length-signature: 4595 +nistkat-sha256: 1d1ee6fb14b864bcc564ad9c416593b2ee1bf93cd65dfe70d9e400bc66be3229 +testvectors-sha256: 9bc663cbfc1b43cff759cfeddd365b665762bc36e1f1d0777ae1196f59617a70 +principal-submitters: + - Vadim Lyubashevsky +auxiliary-submitters: + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Peter Schwabe + - Gregor Seiler + - Damien Stehlé +implementations: + - name: clean + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + - name: avx2 + version: 
https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium5/avx2/LICENSE b/crypto_sign/dilithium5/avx2/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. diff --git a/crypto_sign/dilithium5/avx2/Makefile b/crypto_sign/dilithium5/avx2/Makefile new file mode 100644 index 00000000..9cd68f9c --- /dev/null +++ b/crypto_sign/dilithium5/avx2/Makefile @@ -0,0 +1,31 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium5_avx2.a +HEADERS=align.h api.h cdecl.h consts.h fips202x4.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=consts.o fips202x4.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o symmetric-shake.o f1600x4.o invntt.o ntt.o pointwise.o shuffle.o +KECCAK4XDIR=../../../common/keccak4x +KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o +KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) + +CFLAGS=-mavx2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) $(KECCAK4X) + $(AR) -r $@ $(OBJECTS) $(KECCAK4X) + +$(KECCAK4X): + $(MAKE) -C $(KECCAK4XDIR) $(KECCAK4XOBJ) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium5/avx2/align.h b/crypto_sign/dilithium5/avx2/align.h new 
file mode 100644 index 00000000..e54cb4a8 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM5_AVX2_ALIGN_H + +#include +#include + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } + +#endif diff --git a/crypto_sign/dilithium5/avx2/api.h b/crypto_sign/dilithium5/avx2/api.h new file mode 100644 index 00000000..7586ee9c --- /dev/null +++ b/crypto_sign/dilithium5/avx2/api.h @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_API_H +#define PQCLEAN_DILITHIUM5_AVX2_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES 2592 +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES 4880 +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES 4595 +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_ALGNAME "Dilithium5" + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium3/avx2/cdecl.inc b/crypto_sign/dilithium5/avx2/cdecl.h similarity index 55% rename from crypto_sign/dilithium3/avx2/cdecl.inc rename to crypto_sign/dilithium5/avx2/cdecl.h index 6c9e5ac1..e961164d 100644 --- a/crypto_sign/dilithium3/avx2/cdecl.inc +++ b/crypto_sign/dilithium5/avx2/cdecl.h @@ -1,5 +1,14 @@ -#ifndef PQCLEAN_DILITHIUM3_AVX2_CDECL -#define 
PQCLEAN_DILITHIUM3_AVX2_CDECL +#ifndef PQCLEAN_DILITHIUM5_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM5_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 /* The C ABI on MacOS exports all symbols with a leading * underscore. This means that any symbols we refer to from @@ -9,10 +18,7 @@ * This define helps us get around this */ -#if defined(__WIN32__) || defined(__APPLE__) -#define cdecl(s) _##s -#else +#define _cdecl(s) _##s #define cdecl(s) s -#endif #endif diff --git a/crypto_sign/dilithium5/avx2/consts.c b/crypto_sign/dilithium5/avx2/consts.c new file mode 100644 index 00000000..1c4a9828 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) + +const qdata_t PQCLEAN_DILITHIUM5_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400, -1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 
684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + 
-1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, 
-1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium5/avx2/consts.h b/crypto_sign/dilithium5/avx2/consts.h new file mode 100644 index 00000000..ff251a04 --- /dev/null +++ 
b/crypto_sign/dilithium5/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM5_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM5_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium5/avx2/f1600x4.S b/crypto_sign/dilithium5/avx2/f1600x4.S new file mode 100644 index 00000000..a21aab88 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/f1600x4.S @@ -0,0 +1,909 @@ +/* Taken from Bas Westerbaan's new 4-way SHAKE implementation + * for Sphincs+ (https://github.com/sphincs/sphincsplus/pull/14/), + * but uses vpshufb for byte-granular rotations as in the Keccak Code Package. */ + +#include "cdecl.h" + +.data +.p2align 5 +rho8: +.byte 7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14,7,0,1,2,3,4,5,6,15,8,9,10,11,12,13,14 +rho56: +.byte 1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8,1,2,3,4,5,6,7,0,9,10,11,12,13,14,15,8 + +.text +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_f1600x4) +.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_f1600x4) +cdecl(PQCLEAN_DILITHIUM5_AVX2_f1600x4): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_f1600x4): +vmovdqa rho8(%rip), %ymm0 +movq $6, %rax +looptop: +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, 
%ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 192(%rdi), %ymm4, %ymm9 +vpxor 384(%rdi), %ymm3, %ymm10 +vpxor 576(%rdi), %ymm2, %ymm11 +vpxor 768(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 0(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 96(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 320(%rdi), %ymm5, %ymm10 +vpxor 512(%rdi), %ymm4, %ymm11 +vpxor 704(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 
+vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 32(%rdi), %ymm4, %ymm8 +vpxor 224(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 608(%rdi), %ymm1, %ymm11 +vpxor 640(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 128(%rdi), %ymm1, %ymm8 +vpxor 160(%rdi), %ymm5, %ymm9 +vpxor 352(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 736(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor 
%ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 64(%rdi), %ymm3, %ymm8 +vpxor 256(%rdi), %ymm2, %ymm9 +vpxor 448(%rdi), %ymm1, %ymm10 +vpxor 480(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 448(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 
512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 512(%rdi), %ymm4, %ymm9 +vpxor 224(%rdi), %ymm3, %ymm10 +vpxor 736(%rdi), %ymm2, %ymm11 +vpxor 448(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 8(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 576(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 640(%rdi), %ymm5, %ymm10 +vpxor 352(%rdi), %ymm4, %ymm11 +vpxor 64(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 
+vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 192(%rdi), %ymm4, %ymm8 +vpxor 704(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 128(%rdi), %ymm1, %ymm11 +vpxor 480(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 768(%rdi), %ymm1, %ymm8 +vpxor 320(%rdi), %ymm5, %ymm9 +vpxor 32(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 256(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq 
$36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 384(%rdi), %ymm3, %ymm8 +vpxor 96(%rdi), %ymm2, %ymm9 +vpxor 608(%rdi), %ymm1, %ymm10 +vpxor 160(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 608(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 
192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 352(%rdi), %ymm4, %ymm9 +vpxor 704(%rdi), %ymm3, %ymm10 +vpxor 256(%rdi), %ymm2, %ymm11 +vpxor 608(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 16(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 
0(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 736(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 480(%rdi), %ymm5, %ymm10 +vpxor 32(%rdi), %ymm4, %ymm11 +vpxor 384(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 512(%rdi), %ymm4, %ymm8 +vpxor 64(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 768(%rdi), %ymm1, %ymm11 +vpxor 160(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 
+vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 768(%rdi) +vpxor 448(%rdi), %ymm1, %ymm8 +vpxor 640(%rdi), %ymm5, %ymm9 +vpxor 192(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 96(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 224(%rdi), %ymm3, %ymm8 +vpxor 576(%rdi), %ymm2, %ymm9 +vpxor 128(%rdi), %ymm1, %ymm10 +vpxor 320(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 
+vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 128(%rdi) +vmovdqa 0(%rdi), %ymm8 +vmovdqa 32(%rdi), %ymm9 +vmovdqa 64(%rdi), %ymm10 +vmovdqa 96(%rdi), %ymm11 +vmovdqa 128(%rdi), %ymm12 +vpxor 160(%rdi), %ymm8, %ymm8 +vpxor 192(%rdi), %ymm9, %ymm9 +vpxor 224(%rdi), %ymm10, %ymm10 +vpxor 256(%rdi), %ymm11, %ymm11 +vpxor 288(%rdi), %ymm12, %ymm12 +vpxor 320(%rdi), %ymm8, %ymm8 +vpxor 352(%rdi), %ymm9, %ymm9 +vpxor 384(%rdi), %ymm10, %ymm10 +vpxor 416(%rdi), %ymm11, %ymm11 +vpxor 448(%rdi), %ymm12, %ymm12 +vpxor 480(%rdi), %ymm8, %ymm8 +vpxor 512(%rdi), %ymm9, %ymm9 +vpxor 544(%rdi), %ymm10, %ymm10 +vpxor 576(%rdi), %ymm11, %ymm11 +vpxor 608(%rdi), %ymm12, %ymm12 +vpxor 640(%rdi), %ymm8, %ymm8 +vpxor 672(%rdi), %ymm9, %ymm9 +vpxor 704(%rdi), %ymm10, %ymm10 +vpxor 736(%rdi), %ymm11, %ymm11 +vpxor 768(%rdi), %ymm12, %ymm12 +vpsllq $1, %ymm9, %ymm13 +vpsllq $1, %ymm10, %ymm14 +vpsllq $1, %ymm11, %ymm15 +vpsllq $1, %ymm12, %ymm7 +vpsllq $1, %ymm8, %ymm6 +vpsrlq $63, %ymm9, %ymm5 +vpsrlq $63, %ymm10, %ymm4 +vpsrlq $63, %ymm11, %ymm3 +vpsrlq $63, %ymm12, %ymm2 +vpsrlq $63, %ymm8, %ymm1 +vpor %ymm13, %ymm5, %ymm5 +vpor %ymm14, %ymm4, %ymm4 +vpor %ymm15, %ymm3, %ymm3 +vpor %ymm7, %ymm2, %ymm2 +vpor %ymm6, %ymm1, %ymm1 +vpxor %ymm5, %ymm12, %ymm5 +vpxor %ymm4, %ymm8, %ymm4 +vpxor %ymm3, %ymm9, %ymm3 +vpxor %ymm2, %ymm10, %ymm2 +vpxor %ymm1, %ymm11, %ymm1 +vpxor 0(%rdi), %ymm5, %ymm8 +vpxor 32(%rdi), %ymm4, %ymm9 +vpxor 64(%rdi), %ymm3, %ymm10 +vpxor 96(%rdi), %ymm2, %ymm11 +vpxor 128(%rdi), %ymm1, %ymm12 +vpsllq $44, %ymm9, %ymm14 +vpsllq $43, %ymm10, %ymm15 +vpsllq $21, %ymm11, %ymm7 +vpsllq $14, %ymm12, %ymm6 +vpsrlq $20, %ymm9, %ymm9 +vpsrlq $21, %ymm10, %ymm10 +vpsrlq $43, %ymm11, %ymm11 +vpsrlq $50, %ymm12, %ymm12 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, 
%ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vpbroadcastq 24(%rsi), %ymm8 +vpxor %ymm8, %ymm13, %ymm13 +vmovdqa %ymm13, 0(%rdi) +vmovdqa %ymm14, 32(%rdi) +vmovdqa %ymm15, 64(%rdi) +vmovdqa %ymm7, 96(%rdi) +vmovdqa %ymm6, 128(%rdi) +vpxor 256(%rdi), %ymm2, %ymm8 +vpxor 288(%rdi), %ymm1, %ymm9 +vpxor 160(%rdi), %ymm5, %ymm10 +vpxor 192(%rdi), %ymm4, %ymm11 +vpxor 224(%rdi), %ymm3, %ymm12 +vpsllq $28, %ymm8, %ymm13 +vpsllq $20, %ymm9, %ymm14 +vpsllq $3, %ymm10, %ymm15 +vpsllq $45, %ymm11, %ymm7 +vpsllq $61, %ymm12, %ymm6 +vpsrlq $36, %ymm8, %ymm8 +vpsrlq $44, %ymm9, %ymm9 +vpsrlq $61, %ymm10, %ymm10 +vpsrlq $19, %ymm11, %ymm11 +vpsrlq $3, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 160(%rdi) +vmovdqa %ymm14, 192(%rdi) +vmovdqa %ymm15, 224(%rdi) +vmovdqa %ymm7, 256(%rdi) +vmovdqa %ymm6, 288(%rdi) +vpxor 352(%rdi), %ymm4, %ymm8 +vpxor 384(%rdi), %ymm3, %ymm9 +vpxor 416(%rdi), %ymm2, %ymm10 +vpxor 448(%rdi), %ymm1, %ymm11 +vpxor 320(%rdi), %ymm5, %ymm12 +vpsllq $1, %ymm8, %ymm13 +vpsllq $6, %ymm9, %ymm14 +vpsllq $25, %ymm10, %ymm15 +#vpsllq $8, %ymm11, %ymm7 +vpsllq $18, %ymm12, %ymm6 +vpsrlq $63, %ymm8, %ymm8 +vpsrlq $58, %ymm9, %ymm9 +vpsrlq $39, %ymm10, %ymm10 +#vpsrlq $56, %ymm11, %ymm11 +vpsrlq $46, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +#vpor %ymm7, %ymm11, %ymm11 +vpshufb %ymm0, %ymm11, %ymm11 
+vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 320(%rdi) +vmovdqa %ymm14, 352(%rdi) +vmovdqa %ymm15, 384(%rdi) +vmovdqa %ymm7, 416(%rdi) +vmovdqa %ymm6, 448(%rdi) +vpxor 608(%rdi), %ymm1, %ymm8 +vpxor 480(%rdi), %ymm5, %ymm9 +vpxor 512(%rdi), %ymm4, %ymm10 +vpxor 544(%rdi), %ymm3, %ymm11 +vpxor 576(%rdi), %ymm2, %ymm12 +vpsllq $27, %ymm8, %ymm13 +vpsllq $36, %ymm9, %ymm14 +vpsllq $10, %ymm10, %ymm15 +vpsllq $15, %ymm11, %ymm7 +#vpsllq $56, %ymm12, %ymm6 +vpsrlq $37, %ymm8, %ymm8 +vpsrlq $28, %ymm9, %ymm9 +vpsrlq $54, %ymm10, %ymm10 +vpsrlq $49, %ymm11, %ymm11 +#vpsrlq $8, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 +vpor %ymm7, %ymm11, %ymm11 +#vpor %ymm6, %ymm12, %ymm12 +vpshufb rho56(%rip), %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 480(%rdi) +vmovdqa %ymm14, 512(%rdi) +vmovdqa %ymm15, 544(%rdi) +vmovdqa %ymm7, 576(%rdi) +vmovdqa %ymm6, 608(%rdi) +vpxor 704(%rdi), %ymm3, %ymm8 +vpxor 736(%rdi), %ymm2, %ymm9 +vpxor 768(%rdi), %ymm1, %ymm10 +vpxor 640(%rdi), %ymm5, %ymm11 +vpxor 672(%rdi), %ymm4, %ymm12 +vpsllq $62, %ymm8, %ymm13 +vpsllq $55, %ymm9, %ymm14 +vpsllq $39, %ymm10, %ymm15 +vpsllq $41, %ymm11, %ymm7 +vpsllq $2, %ymm12, %ymm6 +vpsrlq $2, %ymm8, %ymm8 +vpsrlq $9, %ymm9, %ymm9 +vpsrlq $25, %ymm10, %ymm10 +vpsrlq $23, %ymm11, %ymm11 +vpsrlq $62, %ymm12, %ymm12 +vpor %ymm13, %ymm8, %ymm8 +vpor %ymm14, %ymm9, %ymm9 +vpor %ymm15, %ymm10, %ymm10 
+vpor %ymm7, %ymm11, %ymm11 +vpor %ymm6, %ymm12, %ymm12 +vpandn %ymm10, %ymm9, %ymm13 +vpandn %ymm11, %ymm10, %ymm14 +vpandn %ymm12, %ymm11, %ymm15 +vpandn %ymm8, %ymm12, %ymm7 +vpandn %ymm9, %ymm8, %ymm6 +vpxor %ymm8, %ymm13, %ymm13 +vpxor %ymm9, %ymm14, %ymm14 +vpxor %ymm10, %ymm15, %ymm15 +vpxor %ymm11, %ymm7, %ymm7 +vpxor %ymm12, %ymm6, %ymm6 +vmovdqa %ymm13, 640(%rdi) +vmovdqa %ymm14, 672(%rdi) +vmovdqa %ymm15, 704(%rdi) +vmovdqa %ymm7, 736(%rdi) +vmovdqa %ymm6, 768(%rdi) +addq $32, %rsi +subq $1, %rax +jnz looptop +ret diff --git a/crypto_sign/dilithium5/avx2/fips202x4.c b/crypto_sign/dilithium5/avx2/fips202x4.c new file mode 100644 index 00000000..6636b507 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/fips202x4.c @@ -0,0 +1,219 @@ +#include "fips202.h" +#include "fips202x4.h" +#include <immintrin.h> +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#define NROUNDS 24 + +/* Keccak-f[1600] round constants, one 64-bit word per round; this table is what the functions in this file pass as the rc argument of PQCLEAN_DILITHIUM5_AVX2_f1600x4. */ +static const uint64_t KeccakF_RoundConstants[NROUNDS] = { + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + (uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; +/* Absorb four inputs in0..in3 of inlen bytes each into the 4-way state s in one call. s is zeroed first; 64-bit element j of every s[k] belongs to input j. r is the rate in bytes: a permutation is applied after every full r-byte block. p is the padding byte XORed in directly after the last message byte; bit 63 of word r/8 - 1 is set as well. */ +static void keccakx4_absorb_once(__m256i s[25], + unsigned int r, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, + uint8_t p) { + size_t i; +
uint64_t pos = 0; + __m256i t, idx; + /* Start from an all-zero state. */ + for (i = 0; i < 25; ++i) { + s[i] = _mm256_setzero_si256(); + } + /* idx holds the four input addresses (element 0 = in0 ... element 3 = in3); each gather below uses pos as the base and scale 1, so element j receives the 64-bit word at byte offset pos of input j. */ + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { + for (i = 0; i < r / 8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= r; + + PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants); + } + /* Fewer than r bytes remain: XOR in the remaining whole 8-byte words. */ + for (i = 0; i < inlen / 8; ++i) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; + } + inlen -= 8 * i; + /* 1..7 trailing bytes: gather a full 8-byte word and keep only the low inlen bytes with a mask. NOTE(review): the gather itself still reads 8 - inlen bytes past the end of each input - confirm every caller's buffer tolerates that over-read. */ + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); + } + /* Padding: XOR byte p into byte position inlen of word i (same value in all four elements), then set bit 63 of the last rate word, index r / 8 - 1. */ + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); +} +/* Squeeze nblocks blocks of r bytes into each of out0..out3. Per block: apply PQCLEAN_DILITHIUM5_AVX2_f1600x4 to s, then store 64-bit element j of s[0..r/8-1] to out_j (low/high halves of the lower 128 bits go to out0/out1, of the upper 128 bits to out2/out3). The output pointers advance by r after every block. */ +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + unsigned int r, + __m256i s[25]) { + unsigned int i; + __m128d t; + + while (nblocks > 0) { + PQCLEAN_DILITHIUM5_AVX2_f1600x4(s, KeccakF_RoundConstants); + for (i = 0; i < r / 8; ++i) { + t = _mm_castsi128_pd(_mm256_castsi256_si128(s[i])); + _mm_storel_pd((double *)&out0[8 * i], t); + _mm_storeh_pd((double *)&out1[8 * i], t); + t = _mm_castsi128_pd(_mm256_extracti128_si256(s[i], 1)); + _mm_storel_pd((double *)&out2[8 * i], t); + _mm_storeh_pd((double *)&out3[8 * i], t); + } + + out0 += r; + out1 += r; + out2 += r; + out3 += r; + --nblocks; + } +} +/* SHAKE128 absorb wrapper: runs keccakx4_absorb_once on state->s with rate SHAKE128_RATE and padding byte 0x1F. */ +void PQCLEAN_DILITHIUM5_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); +} +/* SHAKE128 squeeze wrapper: runs keccakx4_squeezeblocks on state->s with rate SHAKE128_RATE. */ +void
PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, state->s); +} +/* SHAKE256 absorb wrapper: runs keccakx4_absorb_once on state->s with rate SHAKE256_RATE and padding byte 0x1F. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb_once(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} +/* SHAKE256 squeeze wrapper: runs keccakx4_squeezeblocks on state->s with rate SHAKE256_RATE; each output receives nblocks * SHAKE256_RATE bytes. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, state->s); +} +/* One-shot four-way SHAKE128: absorbs the inlen-byte inputs in0..in3 and writes outlen bytes to each of out0..out3, using a local state. */ +void PQCLEAN_DILITHIUM5_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + keccakx4_state state; + /* Absorb, then squeeze all whole blocks straight into the caller's buffers. */ + PQCLEAN_DILITHIUM5_AVX2_shake128x4_absorb_once(&state, in0, in1, in2, in3, inlen); + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + /* Fewer than SHAKE128_RATE bytes are left: squeeze one more block into the scratch rows t[0..3] and copy only the first outlen bytes of each. */ + if (outlen) { + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} +/* One-shot four-way SHAKE256: absorbs the inlen-byte inputs in0..in3 and writes outlen bytes to each of out0..out3, using a local state. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + unsigned int i; + size_t nblocks = outlen /
SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + keccakx4_state state; + /* Absorb, then squeeze all whole blocks straight into the caller's buffers. */ + PQCLEAN_DILITHIUM5_AVX2_shake256x4_absorb_once(&state, in0, in1, in2, in3, inlen); + PQCLEAN_DILITHIUM5_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + /* Fewer than SHAKE256_RATE bytes are left: squeeze one more block into the scratch rows t[0..3] and copy only the first outlen bytes of each. */ + if (outlen) { + PQCLEAN_DILITHIUM5_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } + } +} diff --git a/crypto_sign/dilithium5/avx2/fips202x4.h b/crypto_sign/dilithium5/avx2/fips202x4.h new file mode 100644 index 00000000..826688a3 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/fips202x4.h @@ -0,0 +1,64 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_FIPS202X4_H +#define PQCLEAN_DILITHIUM5_AVX2_FIPS202X4_H + +#include <immintrin.h> +#include <stddef.h> +#include <stdint.h> +/* Four Keccak states stored side by side: 25 vectors of four 64-bit words, one word per state in each vector. */ +typedef struct { + __m256i s[25]; +} keccakx4_state; +/* Four-way Keccak permutation (implemented in assembly); s is the 25-vector state, rc the round-constant table. */ +void PQCLEAN_DILITHIUM5_AVX2_f1600x4(__m256i *s, const uint64_t *rc); +/* Initialise state and absorb four inlen-byte inputs with the SHAKE128 rate. */ +void PQCLEAN_DILITHIUM5_AVX2_shake128x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); +/* Squeeze nblocks SHAKE128-rate blocks from state into each of out0..out3. */ +void PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); +/* Initialise state and absorb four inlen-byte inputs with the SHAKE256 rate. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4_absorb_once(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); +/* Squeeze nblocks SHAKE256-rate blocks from state into each of out0..out3. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state); +/* One-shot four-way SHAKE128: outlen output bytes per lane from inlen input bytes per lane. */ +void PQCLEAN_DILITHIUM5_AVX2_shake128x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); +/* One-shot four-way SHAKE256: outlen output bytes per lane from inlen input bytes per lane. */ +void PQCLEAN_DILITHIUM5_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t
*in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +void PQCLEAN_DILITHIUM5_AVX2_shake256x4(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t outlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen); + +#endif diff --git a/crypto_sign/dilithium5/avx2/invntt.S b/crypto_sign/dilithium5/avx2/invntt.S new file mode 100644 index 00000000..0ed38a11 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/invntt.S @@ -0,0 +1,240 @@ +#include "cdecl.h" +.include "shuffle.inc" +/* Inverse-NTT butterfly on registers ymm<l> and ymm<h> (eight 32-bit values each). Result: ymm<l> = l + h; ymm<h> = high32((h - l) * zh) - high32(low32((h - l) * zl) * ymm0). Even dwords use the zl0/zh0 registers, odd dwords use zl1/zh1 (defaults: ymm1 and ymm2 for both). Clobbers ymm12-ymm14. Presumably zl holds the twiddles pre-multiplied by q^-1 and ymm0 holds q, making this a Montgomery multiplication - confirm against the constants table. */ +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 + +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h + +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +.endm +/* Inverse-NTT levels 0-5 on one 256-byte slice of the array at rdi, selected by off (0..3): loads eight vectors into ymm4-ymm11, runs six butterfly levels with twiddles read from the table at rsi (_ZETAS_QINV and _ZETAS offsets), and stores eight vectors back to the same slice. The shuffle2/shuffle4/shuffle8 macros come from shuffle.inc, which is not visible here. */ +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +/* level 0 */ +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup
%ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 + +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 + +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 + +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 + +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 + +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 + +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 + +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 + +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 +/* Write the eight result vectors back to the 256-byte slice they were loaded from; after the shuffles the results live in ymm9, ymm10, ymm6, ymm4, ymm3, ymm5, ymm8, ymm11 in that order. */ +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm +/* Inverse-NTT levels 6-7 for column off (0..3): loads eight vectors spaced 128 bytes apart starting at byte 32*off of the array at rdi, applies two butterfly levels with broadcast twiddles (table indices 3 and 2 for level 6, index 0 for level 7), stores ymm8-ymm11 back, then multiplies ymm4-ymm7 by the _8XDIV constant with the same multiply-and-reduce sequence the butterfly uses (with _8XDIV_QINV and ymm0) and stores them back. */ +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa
640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) + +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd $0xAA,%ymm8,%ymm6,%ymm6 +vpblendd $0xAA,%ymm9,%ymm7,%ymm7 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa 
%ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_invntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_invntt_avx) +cdecl(PQCLEAN_DILITHIUM5_AVX2_invntt_avx): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_invntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t5 0 +levels0t5 1 +levels0t5 2 +levels0t5 3 + +levels6t7 0 +levels6t7 1 +levels6t7 2 +levels6t7 3 + +ret diff --git a/crypto_sign/dilithium5/avx2/ntt.S b/crypto_sign/dilithium5/avx2/ntt.S new file mode 100644 index 00000000..971cc84b --- /dev/null +++ b/crypto_sign/dilithium5/avx2/ntt.S @@ -0,0 +1,199 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpmuldq %ymm\zl0,%ymm\h,%ymm13 +vmovshdup %ymm\h,%ymm12 +vpmuldq %ymm\zl1,%ymm12,%ymm14 + +vpmuldq %ymm\zh0,%ymm\h,%ymm\h +vpmuldq %ymm\zh1,%ymm12,%ymm12 + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vmovshdup %ymm\h,%ymm\h +vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h + +vpsubd %ymm\h,%ymm\l,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vmovshdup %ymm13,%ymm13 +vpblendd $0xAA,%ymm14,%ymm13,%ymm13 + +vpaddd %ymm13,%ymm12,%ymm\h +vpsubd %ymm13,%ymm\l,%ymm\l +.endm + +.macro levels0t1 off +/* level 0 */ +vpbroadcastd (_ZETAS_QINV+1)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+1)*4(%rsi),%ymm2 + +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +/* level 1 */ +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa 
%ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm + +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 + +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 + +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 + +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 + +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 4,11,1,10,2,15 + +/* level 7 */ +vmovdqa (_ZETAS_QINV+168+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off)*4(%rsi),%ymm2 +vpsrlq 
$32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,8,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 7,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+64)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+64)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,4,1,10,2,15 + +vmovdqa (_ZETAS_QINV+168+8*\off+96)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+168+8*\off+96)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 3,11,1,10,2,15 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm8,256*\off+ 32(%rdi) +vmovdqa %ymm7,256*\off+ 64(%rdi) +vmovdqa %ymm6,256*\off+ 96(%rdi) +vmovdqa %ymm5,256*\off+128(%rdi) +vmovdqa %ymm4,256*\off+160(%rdi) +vmovdqa %ymm3,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.text +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_ntt_avx) +.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_ntt_avx) +cdecl(PQCLEAN_DILITHIUM5_AVX2_ntt_avx): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_ntt_avx): +vmovdqa _8XQ*4(%rsi),%ymm0 + +levels0t1 0 +levels0t1 1 +levels0t1 2 +levels0t1 3 + +levels2t7 0 +levels2t7 1 +levels2t7 2 +levels2t7 3 + +ret + diff --git a/crypto_sign/dilithium5/avx2/ntt.h b/crypto_sign/dilithium5/avx2/ntt.h new file mode 100644 index 00000000..98ac1f21 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/ntt.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_NTT_H +#define PQCLEAN_DILITHIUM5_AVX2_NTT_H + +#include <immintrin.h> + +void PQCLEAN_DILITHIUM5_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM5_AVX2_qdata); +void PQCLEAN_DILITHIUM5_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM5_AVX2_qdata); + +void PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx(__m256i *a); + +void PQCLEAN_DILITHIUM5_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM5_AVX2_qdata); +void PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i 
*PQCLEAN_DILITHIUM5_AVX2_qdata); + +#endif diff --git a/crypto_sign/dilithium5/avx2/packing.c b/crypto_sign/dilithium5/avx2/packing.c new file mode 100644 index 00000000..2aeec579 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). +* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; /* Byte layout written below: rho | key | tr | s1 | s2 | t0. This differs from the parameter order; sk is advanced past each field once it has been written. */ + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { /* s1 holds L polynomials */ + PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { /* s2 holds K polynomials */ + PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { /* t0 is the last field, so sk is not advanced afterwards */ + PQCLEAN_DILITHIUM5_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t tr[]: output byte array for tr +* - uint8_t key[]: output byte array for key +* - polyveck *t0: pointer to output vector t0 +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium5/avx2/packing.h b/crypto_sign/dilithium5/avx2/packing.h new file mode 100644 index 00000000..d69bac5a --- /dev/null +++ b/crypto_sign/dilithium5/avx2/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM5_AVX2_PACKING_H +#include "params.h" +#include "polyvec.h" +#include <stdint.h> + +void PQCLEAN_DILITHIUM5_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM5_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM5_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void PQCLEAN_DILITHIUM5_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], 
polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM5_AVX2_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium5/avx2/params.h b/crypto_sign/dilithium5/avx2/params.h new file mode 100644 index 00000000..70ddfabd --- /dev/null +++ b/crypto_sign/dilithium5/avx2/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_PARAMS_H +#define PQCLEAN_DILITHIUM5_AVX2_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 75 +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_ALGNAME "Dilithium5" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium5/avx2/pointwise.S b/crypto_sign/dilithium5/avx2/pointwise.S new file mode 100644 index 00000000..73c0589e --- /dev/null +++ b/crypto_sign/dilithium5/avx2/pointwise.S @@ -0,0 +1,205 @@ +#include "params.h" +#include "cdecl.h" + +.text +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_avx) 
+.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_avx) +cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_avx): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm14,%ymm15 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq 
%ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +.macro pointwise off +#load +vmovdqa \off(%rsi),%ymm6 +vmovdqa \off+32(%rsi),%ymm8 +vmovdqa \off(%rdx),%ymm10 +vmovdqa \off+32(%rdx),%ymm12 +vpsrlq $32,%ymm6,%ymm7 +vpsrlq $32,%ymm8,%ymm9 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm6,%ymm10,%ymm6 +vpmuldq %ymm7,%ymm11,%ymm7 +vpmuldq %ymm8,%ymm12,%ymm8 +vpmuldq %ymm9,%ymm13,%ymm9 +.endm + +.macro acc +vpaddq %ymm6,%ymm2,%ymm2 +vpaddq %ymm7,%ymm3,%ymm3 +vpaddq %ymm8,%ymm4,%ymm4 +vpaddq %ymm9,%ymm5,%ymm5 +.endm + +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx) +.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx) +cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx): +#consts +vmovdqa _8XQINV*4(%rcx),%ymm0 +vmovdqa _8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop2: +pointwise 0 + +#mov +vmovdqa %ymm6,%ymm2 +vmovdqa %ymm7,%ymm3 +vmovdqa %ymm8,%ymm4 +vmovdqa %ymm9,%ymm5 + +pointwise 1024 +acc + +pointwise 2048 +acc + +pointwise 3072 +acc + +pointwise 4096 +acc + +pointwise 5120 +acc + +pointwise 6144 +acc + +#reduce +vpmuldq %ymm0,%ymm2,%ymm6 +vpmuldq %ymm0,%ymm3,%ymm7 +vpmuldq %ymm0,%ymm4,%ymm8 +vpmuldq %ymm0,%ymm5,%ymm9 +vpmuldq %ymm1,%ymm6,%ymm6 +vpmuldq %ymm1,%ymm7,%ymm7 +vpmuldq %ymm1,%ymm8,%ymm8 +vpmuldq %ymm1,%ymm9,%ymm9 +vpsubq %ymm6,%ymm2,%ymm2 +vpsubq %ymm7,%ymm3,%ymm3 +vpsubq %ymm8,%ymm4,%ymm4 +vpsubq %ymm9,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi 
+add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium5/avx2/poly.c b/crypto_sign/dilithium5/avx2/poly.c new file mode 100644 index 00000000..e6bfd3c8 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/poly.c @@ -0,0 +1,1022 @@ +#include "align.h" +#include "consts.h" +#include "fips202x4.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" +#include <immintrin.h> +#include <stdint.h> +#include <string.h> + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_reduce(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { /* N / 8 vectors of eight 32-bit lanes */ + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); /* f doubles as the mask: a lane takes q when the top (sign) bit of f is set, otherwise 0 */ + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in +* absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_freeze(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(a); /* the input bound quoted above is the one this call documents */ + PQCLEAN_DILITHIUM5_AVX2_poly_caddq(a); /* then lift negative lanes by Q */ + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); /* lane-wise 32-bit add */ + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_sub +* +* Description: Subtract polynomials. 
No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i f; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM5_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. 
+* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM5_AVX2_qdata.vec); /* assembly routine; second argument is the shared constant table */ + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(poly *a) { /* In-place wrapper: passes a->vec to nttunpack_avx. NOTE(review): the coefficient reordering that routine performs is not visible from this file - confirm against its source. */ + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM5_AVX2_qdata.vec); /* argument order: output, two inputs, constant table */ + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* positive standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such that c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be positive standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_make_hint +* +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. +* +* Arguments: - uint8_t hint[]: pointer to output hint array (preallocated of length N) +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of hints, i.e. length of hint array. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM5_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM5_AVX2_make_hint_avx(hint, a0->vec, a1->vec); /* the helper fills hint[]; its return value is passed through unchanged */ + + DBENCH_STOP(*tround); + return r; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_AVX2_use_hint_avx(b->vec, a->vec, h->vec); /* here the hint is a full poly, unlike the byte array produced by poly_make_hint */ + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM5_AVX2_poly_reduce(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. 
+**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); /* B - 1, so the signed greater-than below flags |coeff| >= B */ + DBENCH_START(); + + if (B > (Q - 1) / 8) { /* bounds above (Q-1)/8 are reported as a failure without scanning */ + return 1; + } + + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); /* all-ones in every lane that exceeds the bound */ + t = _mm256_or_si256(t, f); /* accumulate flags across the whole polynomial; no early exit */ + } + + r = 1 - _mm256_testz_si256(t, t); /* testz gives 1 only when t is all zero, so r is 1 when any lane was flagged */ + DBENCH_STOP(*tsample); + return r; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { /* each candidate consumes three bytes, accepted or not */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits of the little-endian 3-byte value */ + + if (t < Q) { /* candidates >= Q are dropped */ + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce (seed/nonce apply to poly_uniform; the _preinit variant below receives an already initialised stream128_state instead) +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { /* scalar fallback: squeeze one more block at a time until all N coefficients exist */ + /* length of buf is always divisible by 3; hence, no bytes left */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); /* state is released after sampling */ +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[32], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { /* four-lane variant: samples a0..a3 from the same seed with four different nonces */ + unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf[4]; + keccakx4_state state; + __m256i f; + + f = _mm256_loadu_si256((__m256i *)seed); /* one unaligned 32-byte load of the seed */ + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[SEEDBYTES + 0] = nonce0; /* nonce is appended after the seed, low byte first */ + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM5_AVX2_shake128x4_absorb_once(&state, 
buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_NBLOCKS, &state); /* the input buffers are reused as output buffers */ + + ctr0 = PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(a3->coeffs, buf[3].coeffs); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* continue until every lane is full; a full lane passes len == 0 and gains nothing */ + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); + + ctr0 += rej_uniform(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_uniform(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_uniform(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_uniform(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); + } +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. The code hard-codes the range [-2, 2]; each byte supplies two 4-bit candidates. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; /* low nibble */ + t1 = buf[pos++] >> 4; /* high nibble */ + + if (t0 < 15) { /* nibble value 15 is rejected */ + t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5 via multiply-and-shift (205/1024 approximates 1/5) */ + a[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < len) { /* re-check len: the first nibble may have filled the last slot */ + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output of the stream128_* generator initialised with (seed, nonce); +* the _preinit variant receives an already initialised state instead. +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_ETA_BUFLEN) buf; /* same buffer size as the _4x variant, which feeds the same squeeze + rej_eta_avx sequence; the previous REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES multiplied two byte counts and over-allocated the stack */ + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { /* scalar fallback, one block at a time */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[32], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { 
+ unsigned int ctr0, ctr1, ctr2, ctr3; + ALIGNED_UINT8(REJ_UNIFORM_ETA_BUFLEN) buf[4]; + + __m256i f; + keccakx4_state state; + + f = _mm256_loadu_si256((__m256i *)seed); /* one unaligned 32-byte load of the seed, copied into all four lane buffers */ + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + + buf[0].coeffs[SEEDBYTES + 0] = nonce0; /* nonce follows the seed, low byte first */ + buf[0].coeffs[SEEDBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[SEEDBYTES + 0] = nonce1; + buf[1].coeffs[SEEDBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[SEEDBYTES + 0] = nonce2; + buf[2].coeffs[SEEDBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[SEEDBYTES + 0] = nonce3; + buf[3].coeffs[SEEDBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM5_AVX2_shake128x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, SEEDBYTES + 2); + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, REJ_UNIFORM_ETA_NBLOCKS, &state); + + ctr0 = PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(a0->coeffs, buf[0].coeffs); + ctr1 = PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(a1->coeffs, buf[1].coeffs); + ctr2 = PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(a2->coeffs, buf[2].coeffs); + ctr3 = PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(a3->coeffs, buf[3].coeffs); + + while (ctr0 < N || ctr1 < N || ctr2 < N || ctr3 < N) { /* continue until every lane is full; a full lane passes len == 0 and gains nothing */ + PQCLEAN_DILITHIUM5_AVX2_shake128x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, 1, &state); + + ctr0 += rej_eta(a0->coeffs + ctr0, N - ctr0, buf[0].coeffs, SHAKE128_RATE); + ctr1 += rej_eta(a1->coeffs + ctr1, N - ctr1, buf[1].coeffs, SHAKE128_RATE); + ctr2 += rej_eta(a2->coeffs + ctr2, N - ctr2, buf[2].coeffs, SHAKE128_RATE); + ctr3 += rej_eta(a3->coeffs + ctr3, N - ctr3, buf[3].coeffs, SHAKE128_RATE); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of 
SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce (seed/nonce apply to poly_uniform_gamma1; the _preinit variant receives an initialised stream256_state) +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM5_AVX2_polyz_unpack uses 32-byte loads that run past the packed data; 14 spare bytes are reserved here (its definition declares POLYZ_PACKEDBYTES + 12) */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(a, buf.coeffs); /* no rejection step: the squeezed bytes are unpacked directly */ +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[48], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3) { /* four-lane variant: same seed, four nonces */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf[4]; + keccakx4_state state; + __m256i f; + __m128i g; + + f = _mm256_loadu_si256((__m256i *)seed); /* seed bytes 0..31 */ + _mm256_store_si256(buf[0].vec, f); + _mm256_store_si256(buf[1].vec, f); + _mm256_store_si256(buf[2].vec, f); + _mm256_store_si256(buf[3].vec, f); + g = _mm_loadu_si128((__m128i *)&seed[32]); /* seed bytes 32..47 */ + _mm_store_si128((__m128i *)&buf[0].vec[1], g); + _mm_store_si128((__m128i *)&buf[1].vec[1], g); + _mm_store_si128((__m128i *)&buf[2].vec[1], g); + _mm_store_si128((__m128i *)&buf[3].vec[1], g); + + buf[0].coeffs[CRHBYTES + 0] = nonce0; /* nonce follows the seed, low byte first */ + buf[0].coeffs[CRHBYTES + 1] = nonce0 >> 8; + buf[1].coeffs[CRHBYTES + 0] = nonce1; + buf[1].coeffs[CRHBYTES + 1] = nonce1 >> 8; + buf[2].coeffs[CRHBYTES + 0] = nonce2; + 
buf[2].coeffs[CRHBYTES + 1] = nonce2 >> 8; + buf[3].coeffs[CRHBYTES + 0] = nonce3; + buf[3].coeffs[CRHBYTES + 1] = nonce3 >> 8; + + PQCLEAN_DILITHIUM5_AVX2_shake256x4_absorb_once(&state, buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, CRHBYTES + 2); + PQCLEAN_DILITHIUM5_AVX2_shake256x4_squeezeblocks(buf[0].coeffs, buf[1].coeffs, buf[2].coeffs, buf[3].coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); /* the input buffers are reused as output buffers */ + + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(a0, buf[0].coeffs); + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(a1, buf[1].coeffs); + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(a2, buf[2].coeffs); + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(a3, buf[3].coeffs); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). +* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); /* first 8 squeezed bytes supply the sign bits, copied in native byte order */ + pos = 8; + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { /* TAU iterations, each placing one nonzero coefficient */ + do { + if (pos >= SHAKE256_RATE) { /* buffer exhausted: squeeze another block */ + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); /* reject bytes until an index b <= i is found */ + + c->coeffs[i] = c->coeffs[b]; /* move whatever sat at b to position i ... */ + c->coeffs[b] = 1 - 2 * (signs & 1); /* ... and store +1 for a 0 sign bit, -1 for a 1 sign bit */ + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with 
coefficients in [-ETA,ETA]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients -> 3 output bytes per iteration */ + t[0] = ETA - a->coeffs[8 * i + 0]; /* each value is stored as ETA - coefficient */ + t[1] = ETA - a->coeffs[8 * i + 1]; + t[2] = ETA - a->coeffs[8 * i + 2]; + t[3] = ETA - a->coeffs[8 * i + 3]; + t[4] = ETA - a->coeffs[8 * i + 4]; + t[5] = ETA - a->coeffs[8 * i + 5]; + t[6] = ETA - a->coeffs[8 * i + 6]; + t[7] = ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); /* 3 bits per value, packed from the low bit upwards; t[2] straddles bytes 0 and 1 */ + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); /* t[5] straddles bytes 1 and 2 */ + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 3 input bytes -> eight 3-bit fields per iteration */ + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; /* field straddles bytes 0 and 1 */ + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; /* field straddles bytes 1 and 2 */ + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; /* undo the ETA - c mapping: coefficient = ETA - field */ + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 4 coefficients of 10 bits -> 5 output bytes per iteration */ + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); /* assignment to uint8_t keeps only the low 8 bits */ + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 5 input bytes -> 4 coefficients per iteration */ + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; /* 0x3FF masks the value to 10 bits */ + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients -> 13 output bytes per iteration (13 bits per value) */ + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; /* each value is stored as 2^(D-1) - coefficient */ + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = t[0]; /* every store truncates to 8 bits; bytes shared by two values are written with = and then |= */ + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] |= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * i + 12] = t[7] >> 5; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 13 input bytes -> 8 coefficients per iteration */ + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; /* 0x1FFF keeps the 13-bit field */ + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; /* undo the packer's mapping: coefficient = 2^(D-1) - field */ + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * 
i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. Two coefficients are written to five bytes (20 bits each). +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; /* only t[0] and t[1] are used below */ + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; /* each value is stored as GAMMA1 - coefficient */ + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; /* byte 2 holds the top 4 bits of t[0] and the low 4 bits of t[1] */ + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial; must be readable for 12 bytes beyond POLYZ_PACKEDBYTES because of the 32-byte loads below +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 12]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, + -1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); /* per 128-bit lane: gathers three source bytes into each 32-bit slot; index -1 produces a zero byte */ + const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); /* per-dword shift counts: 0 for even dwords, 4 for odd dwords */ + const __m256i mask = _mm256_set1_epi32(0xFFFFF); /* keeps 20 bits per coefficient */ + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { /* 8 coefficients from 20 packed bytes per iteration */ + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); /* loads 32 bytes although only 20 are consumed, hence the over-read on the last iteration */ + f = _mm256_permute4x64_epi64(f, 0x94); /* selects source quadwords 0,1,1,2 so the upper lane starts at byte 8 */ + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); /* coefficient = GAMMA1 - field */ + _mm256_store_si256(&r->vec[i], f); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43] (the implementation below writes two coefficients per output byte, i.e. 4 bits each). +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); /* byte weights (1, 16): maddubs merges each pair of 4-bit values into one byte value */ + const __m256i shufbidx = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); /* reorders bytes after the lane-wise packs */ + DBENCH_START(); + + for (i = 0; i < N / 64; ++i) { /* 64 coefficients -> 32 output bytes per iteration */ + f0 = _mm256_load_si256(&a->vec[8 * i + 0]); + f1 = _mm256_load_si256(&a->vec[8 * i + 1]); + f2 = _mm256_load_si256(&a->vec[8 * i + 2]); + f3 = _mm256_load_si256(&a->vec[8 * i + 3]); + f4 = _mm256_load_si256(&a->vec[8 * i + 4]); + f5 = _mm256_load_si256(&a->vec[8 * i + 5]); + f6 = _mm256_load_si256(&a->vec[8 * i + 6]); + f7 = _mm256_load_si256(&a->vec[8 * i + 7]); + f0 = _mm256_packus_epi32(f0, f1); /* 32-bit -> 16-bit */ + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); /* 16-bit -> 8-bit */ + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); /* low byte + 16 * high byte in each 16-bit lane */ + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium5/avx2/poly.h b/crypto_sign/dilithium5/avx2/poly.h new file mode 100644 index 00000000..0dc8c4ac --- /dev/null +++ b/crypto_sign/dilithium5/avx2/poly.h @@ -0,0 +1,79 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_POLY_H +#define PQCLEAN_DILITHIUM5_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" +#include <stdint.h> + +typedef ALIGNED_INT32(N) poly; + +void PQCLEAN_DILITHIUM5_AVX2_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM5_AVX2_poly_caddq(poly *a); +void 
PQCLEAN_DILITHIUM5_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM5_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM5_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM5_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM5_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM5_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM5_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[SEEDBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + 
uint16_t nonce3); +void PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_4x(poly *a0, + poly *a1, + poly *a2, + poly *a3, + const uint8_t seed[CRHBYTES], + uint16_t nonce0, + uint16_t nonce1, + uint16_t nonce2, + uint16_t nonce3); + +void PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 14]); /* NOTE(review): the definition in poly.c declares a[POLYZ_PACKEDBYTES + 12]; the bound is advisory for a pointer parameter, but the two should agree */ + +void PQCLEAN_DILITHIUM5_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES + 8], const poly *a); /* NOTE(review): the definition in poly.c declares r[POLYW1_PACKEDBYTES] */ + +#endif diff --git a/crypto_sign/dilithium5/avx2/polyvec.c b/crypto_sign/dilithium5/avx2/polyvec.c new file mode 100644 index 00000000..de295523 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/polyvec.c @@ -0,0 +1,538 @@ +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stdint.h> + +#define UNUSED(x) (void)x + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row0(&mat[0], &mat[1], rho); /* each row helper fills the rest of its own row plus the leading entries of the next row (second pointer) */ + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row1(&mat[1], &mat[2], rho); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row2(&mat[2], &mat[3], rho); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row3(&mat[3], NULL, rho); /* row3 ignores its second pointer */ + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row4(&mat[4], &mat[5], rho); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row5(&mat[5], &mat[6], rho); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row6(&mat[6], &mat[7], rho); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row7(&mat[7], NULL, rho); /* row7 ignores its second pointer */ +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* samples rowa[0..6] and rowb[0]; the nonce constants follow (row << 8) + column */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 0, 1, 2, 3); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[4], &rowa->vec[5], &rowa->vec[6], &rowb->vec[0], rho, 4, 5, 6, 256); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* samples rowa[1..6] and rowb[0..1]; rowa[0] was produced by the row0 helper */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[1], &rowa->vec[2], &rowa->vec[3], &rowa->vec[4], rho, 257, 258, 259, 260); + 
PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[5], &rowa->vec[6], &rowb->vec[0], &rowb->vec[1], rho, 261, 262, 512, 513); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[1]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* samples rowa[2..6] and rowb[0..2] */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[2], &rowa->vec[3], &rowa->vec[4], &rowa->vec[5], rho, 514, 515, 516, 517); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[6], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2], rho, 518, 768, 769, 770); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[2]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* samples rowa[3..6] only; rowb is unused */ + UNUSED(rowb); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowa->vec[5], &rowa->vec[6], rho, 771, 772, 773, 774); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, 
polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* same layout as row0, with nonces starting at 1024 */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[0], &rowa->vec[1], &rowa->vec[2], &rowa->vec[3], rho, 1024, 1025, 1026, 1027); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[4], &rowa->vec[5], &rowa->vec[6], &rowb->vec[0], rho, 1028, 1029, 1030, 1280); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* same layout as row1, with nonces starting at 1281 */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[1], &rowa->vec[2], &rowa->vec[3], &rowa->vec[4], rho, 1281, 1282, 1283, 1284); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[5], &rowa->vec[6], &rowb->vec[0], &rowb->vec[1], rho, 1285, 1286, 1536, 1537); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[1]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* same layout as row2, with nonces starting at 1538 */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[2], &rowa->vec[3], &rowa->vec[4], &rowa->vec[5], rho, 1538, 1539, 1540, 1541); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[6], &rowb->vec[0], &rowb->vec[1], &rowb->vec[2], rho, 1542, 1792, 1793, 
1794); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[2]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[0]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[1]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowb->vec[2]); +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]) { /* same layout as row3; rowb is unused */ + UNUSED(rowb); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_4x(&rowa->vec[3], &rowa->vec[4], &rowa->vec[5], &rowa->vec[6], rho, 1795, 1796, 1797, 1798); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[3]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[4]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[5]); + PQCLEAN_DILITHIUM5_AVX2_poly_nttunpack(&rowa->vec[6]); +} + + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { /* t[i] = accumulated pointwise product of matrix row i with v, for each of the K rows */ + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { /* samples the L polynomials with consecutive nonces nonce, nonce+1, ... */ + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { /* NOTE(review): seed is forwarded to poly_uniform_gamma1, which poly.h declares with seed[CRHBYTES] - confirm the intended length */ + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); /* per-polynomial nonce is L * nonce + i */ + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_reduce(polyvecl *v) { + 
unsigned int i; + + for (i = 0; i < L; ++i) { /* applies poly_reduce to each of the L polynomials */ + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM5_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM5_AVX2_qdata.vec); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM5_AVX2_polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM5_AVX2_polyveck_reduce(). 
+* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - uint8_t *hint: pointer to output hint array +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; + + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM5_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); + } + + return n; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. 
+* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium5/avx2/polyvec.h b/crypto_sign/dilithium5/avx2/polyvec.h new file mode 100644 index 00000000..6213436b --- /dev/null +++ b/crypto_sign/dilithium5/avx2/polyvec.h @@ -0,0 +1,72 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM5_AVX2_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + +int 
PQCLEAN_DILITHIUM5_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM5_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM5_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM5_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM5_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row0(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row1(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row2(polyvecl *rowa, polyvecl *rowb, const uint8_t 
rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row3(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row4(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row5(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row6(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row7(polyvecl *rowa, polyvecl *rowb, const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium4/avx2/rejsample.c b/crypto_sign/dilithium5/avx2/rejsample.c similarity index 66% rename from crypto_sign/dilithium4/avx2/rejsample.c rename to crypto_sign/dilithium5/avx2/rejsample.c index d7dfe41c..96159f34 100644 --- a/crypto_sign/dilithium4/avx2/rejsample.c +++ b/crypto_sign/dilithium5/avx2/rejsample.c @@ -1,9 +1,10 @@ -#include - #include "params.h" #include "rejsample.h" +#include "symmetric.h" +#include +#include -static const uint8_t idx[256][8] = { +const uint8_t PQCLEAN_DILITHIUM5_AVX2_idxlut[256][8] = { { 0, 0, 0, 0, 0, 0, 0, 0}, { 0, 0, 0, 0, 0, 0, 0, 0}, { 1, 0, 0, 0, 0, 0, 0, 0}, @@ -262,178 +263,144 @@ static const uint8_t idx[256][8] = { { 0, 1, 2, 3, 4, 5, 6, 7} }; -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_uniform( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos, vec[8]; - __m256i d, tmp; +unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { + unsigned int ctr, pos; uint32_t good; + __m256i d, tmp; const __m256i bound = _mm256_set1_epi32(Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, + -1, 9, 8, 7, 
-1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, + -1, 5, 4, 3, -1, 2, 1, 0); ctr = pos = 0; - while (ctr + 8 <= len && pos + 24 <= buflen) { - for (size_t i = 0; i < 8; i++) { - vec[i] = buf[pos++]; - vec[i] |= (uint32_t)buf[pos++] << 8; - vec[i] |= (uint32_t)buf[pos++] << 16; - vec[i] &= 0x7FFFFF; - } + while (pos <= REJ_UNIFORM_BUFLEN - 24) { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); + d = _mm256_permute4x64_epi64(d, 0x94); + d = _mm256_shuffle_epi8(d, idx8); + d = _mm256_and_si256(d, mask); + pos += 24; - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); + tmp = _mm256_sub_epi32(d, bound); good = _mm256_movemask_ps((__m256)tmp); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); + tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good])); d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); + + _mm256_storeu_si256((__m256i *)&r[ctr], d); + ctr += _mm_popcnt_u32(good); + + if (ctr > N - 8) { + break; + } } - while (ctr < len && pos + 3 <= buflen) { - vec[0] = buf[pos++]; - vec[0] |= (uint32_t)buf[pos++] << 8; - vec[0] |= (uint32_t)buf[pos++] << 16; - vec[0] &= 0x7FFFFF; + uint32_t t; + while (ctr < N && pos <= REJ_UNIFORM_BUFLEN - 3) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; - if (vec[0] < Q) { - r[ctr++] = vec[0]; + if (t < Q) { + r[ctr++] = t; } } return ctr; } -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_eta( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos; - uint8_t vec[32]; - __m256i tmp0, tmp1; - __m128i d0, d1, rid; +unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { + unsigned int ctr, pos; uint32_t good; - const __m256i bound = _mm256_set1_epi8(2 * ETA + 1); - const __m256i off = _mm256_set1_epi32(Q + ETA); + 
__m256i f0, f1, f2; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(ETA); + const __m256i bound = mask; + const __m256i v = _mm256_set1_epi32(-6560); + const __m256i p = _mm256_set1_epi32(5); ctr = pos = 0; - while (ctr + 32 <= len && pos + 16 <= buflen) { - for (size_t i = 0; i < 16; i++) { - vec[2 * i + 0] = buf[pos] & 0x07; - vec[2 * i + 1] = buf[pos++] >> 5; + while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = _mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - tmp0 = _mm256_loadu_si256((__m256i_u *)vec); - tmp1 = _mm256_cmpgt_epi8(bound, tmp0); - good = _mm256_movemask_epi8(tmp1); + if (ctr > N - 8) { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = 
_mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; - d0 = _mm256_castsi256_si128(tmp0); - rid = _mm_loadl_epi64((__m128i_u *)&idx[good & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount(good & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 8) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 8) & 0xFF); - - d0 = _mm256_extracti128_si256(tmp0, 1); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 16) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 16) & 0xFF); - - d0 = _mm_bsrli_si128(d0, 8); - rid = _mm_loadl_epi64((__m128i_u *)&idx[(good >> 24) & 0xFF]); - d1 = _mm_shuffle_epi8(d0, rid); - tmp1 = _mm256_cvtepu8_epi32(d1); - tmp1 = _mm256_sub_epi32(off, tmp1); - _mm256_storeu_si256((__m256i_u *)&r[ctr], tmp1); - ctr += __builtin_popcount((good >> 24) & 0xFF); + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5_AVX2_idxlut[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good); + pos += 4; } - while (ctr < len && pos < buflen) { - vec[0] = buf[pos] & 0x07; - vec[1] = buf[pos++] >> 5; + uint32_t t0, t1; + while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; - if (vec[0] <= 2 * ETA) { - 
r[ctr++] = Q + ETA - vec[0]; + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + r[ctr++] = 2 - t0; } - if (vec[1] <= 2 * ETA && ctr < len) { - r[ctr++] = Q + ETA - vec[1]; - } - } - - return ctr; -} - -uint32_t PQCLEAN_DILITHIUM4_AVX2_rej_gamma1m1( - uint32_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - uint32_t ctr, pos; - uint32_t vec[8]; - __m256i d, tmp; - uint32_t good; - const __m256i bound = _mm256_set1_epi32(2 * GAMMA1 - 1); - const __m256i off = _mm256_set1_epi32(Q + GAMMA1 - 1); - - ctr = pos = 0; - while (ctr + 8 <= len && pos + 20 <= buflen) { - for (size_t i = 0; i < 4; i++) { - vec[2 * i + 0] = buf[pos + 0]; - vec[2 * i + 0] |= (uint32_t)buf[pos + 1] << 8; - vec[2 * i + 0] |= (uint32_t)buf[pos + 2] << 16; - vec[2 * i + 0] &= 0xFFFFF; - - vec[2 * i + 1] = buf[pos + 2] >> 4; - vec[2 * i + 1] |= (uint32_t)buf[pos + 3] << 4; - vec[2 * i + 1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - } - - d = _mm256_loadu_si256((__m256i_u *)vec); - tmp = _mm256_cmpgt_epi32(bound, d); - good = _mm256_movemask_ps((__m256)tmp); - d = _mm256_sub_epi32(off, d); - - __m128i rid = _mm_loadl_epi64((__m128i_u *)&idx[good]); - tmp = _mm256_cvtepu8_epi32(rid); - d = _mm256_permutevar8x32_epi32(d, tmp); - _mm256_storeu_si256((__m256i_u *)&r[ctr], d); - ctr += __builtin_popcount(good); - } - - while (ctr < len && pos + 5 <= buflen) { - vec[0] = buf[pos + 0]; - vec[0] |= (uint32_t)buf[pos + 1] << 8; - vec[0] |= (uint32_t)buf[pos + 2] << 16; - vec[0] &= 0xFFFFF; - - vec[1] = buf[pos + 2] >> 4; - vec[1] |= (uint32_t)buf[pos + 3] << 4; - vec[1] |= (uint32_t)buf[pos + 4] << 12; - - pos += 5; - - if (vec[0] <= 2 * GAMMA1 - 2) { - r[ctr++] = Q + GAMMA1 - 1 - vec[0]; - } - if (vec[1] <= 2 * GAMMA1 - 2 && ctr < len) { - r[ctr++] = Q + GAMMA1 - 1 - vec[1]; + if (t1 < 15 && ctr < N) { + t1 = t1 - (205 * t1 >> 10) * 5; + r[ctr++] = 2 - t1; } } diff --git a/crypto_sign/dilithium5/avx2/rejsample.h b/crypto_sign/dilithium5/avx2/rejsample.h new file mode 100644 index 
00000000..78786bf8 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/rejsample.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_REJSAMPLE_H +#define PQCLEAN_DILITHIUM5_AVX2_REJSAMPLE_H +#include "params.h" +#include "symmetric.h" +#include + +#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_BUFLEN (REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) + +#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) + +extern const uint8_t PQCLEAN_DILITHIUM5_AVX2_idxlut[256][8]; + +unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); + +unsigned int PQCLEAN_DILITHIUM5_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN]); + +#endif diff --git a/crypto_sign/dilithium5/avx2/rounding.c b/crypto_sign/dilithium5/avx2/rounding.c new file mode 100644 index 00000000..ad1c7128 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/rounding.c @@ -0,0 +1,154 @@ +#include "consts.h" +#include "params.h" +#include "rejsample.h" +#include "rounding.h" +#include +#include +#include + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: power2round +* +* Description: For finite field elements a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be positive standard representative. 
+* +* Arguments: - __m256i *a1: output array of length N/8 with high bits +* - __m256i *a0: output array of length N/8 with low bits a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i mask = _mm256_set1_epi32(-(1 << D)); + const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); + + for (i = 0; i < N / 8; ++i) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, half); + f0 = _mm256_and_si256(f1, mask); + f1 = _mm256_srli_epi32(f1, D); + f0 = _mm256_sub_epi32(f, f0); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +/************************************************* +* Name: decompose +* +* Description: For finite field element a, compute high and low parts a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard +* representative. 
+* +* Arguments: - __m256i *a1: output array of length N/8 with high parts +* - __m256i *a0: output array of length N/8 with low parts a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5_AVX2_qdata.vec[_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + + +/************************************************* +* Name: make_hint +* +* Description: Compute indices of polynomial coefficients whose low bits +* overflow into the high bits. 
+* +* Arguments: - uint8_t *hint: hint array +* - const __m256i *a0: low bits of input elements +* - const __m256i *a1: high bits of input elements +* +* Returns number of overflowing low bits +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { + unsigned int i, n = 0; + __m256i f0, f1, g0, g1; + uint32_t bad; + uint64_t idx; + const __m256i low = _mm256_set1_epi32(-GAMMA2); + const __m256i high = _mm256_set1_epi32(GAMMA2); + + for (i = 0; i < N / 8; ++i) { + f0 = _mm256_load_si256(&a0[i]); + f1 = _mm256_load_si256(&a1[i]); + g0 = _mm256_abs_epi32(f0); + g0 = _mm256_cmpgt_epi32(g0, high); + g1 = _mm256_cmpeq_epi32(f0, low); + g1 = _mm256_sign_epi32(g1, f1); + g0 = _mm256_or_si256(g0, g1); + + bad = _mm256_movemask_ps((__m256)g0); + memcpy(&idx, PQCLEAN_DILITHIUM5_AVX2_idxlut[bad], 8); + idx += (uint64_t)0x0808080808080808 * i; + memcpy(&hint[n], &idx, 8); + n += _mm_popcnt_u32(bad); + } + + return n; +} + +/************************************************* +* Name: use_hint +* +* Description: Correct high parts according to hint. 
+* +* Arguments: - __m256i *b: output array of length N/8 with corrected high parts +* - const __m256i *a: input array of length N/8 +* - const __m256i *hint: input array of length N/8 with hint bits +* +**************************************************/ +void PQCLEAN_DILITHIUM5_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { + unsigned int i; + __m256i a0[N / 8]; + __m256i f, g, h, t; + const __m256i zero = _mm256_setzero_si256(); + const __m256i mask = _mm256_set1_epi32(15); + + PQCLEAN_DILITHIUM5_AVX2_decompose_avx(b, a0, a); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a0[i]); + g = _mm256_load_si256(&b[i]); + h = _mm256_load_si256(&hint[i]); + t = _mm256_blendv_epi32(zero, h, f); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + g = _mm256_add_epi32(g, h); + g = _mm256_and_si256(g, mask); + _mm256_store_si256(&b[i], g); + } +} diff --git a/crypto_sign/dilithium5/avx2/rounding.h b/crypto_sign/dilithium5/avx2/rounding.h new file mode 100644 index 00000000..c3483f1e --- /dev/null +++ b/crypto_sign/dilithium5/avx2/rounding.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_ROUNDING_H +#define PQCLEAN_DILITHIUM5_AVX2_ROUNDING_H +#include "params.h" +#include +#include + +void PQCLEAN_DILITHIUM5_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); +void PQCLEAN_DILITHIUM5_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); +unsigned int PQCLEAN_DILITHIUM5_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); +void PQCLEAN_DILITHIUM5_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); + +#endif diff --git a/crypto_sign/dilithium5/avx2/shuffle.S b/crypto_sign/dilithium5/avx2/shuffle.S new file mode 100644 index 00000000..ab186107 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/shuffle.S @@ -0,0 +1,54 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 
+vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM5_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium5/avx2/shuffle.inc b/crypto_sign/dilithium5/avx2/shuffle.inc new file mode 100644 index 00000000..73e9ffe0 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/shuffle.inc @@ -0,0 +1,25 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium5/avx2/sign.c b/crypto_sign/dilithium5/avx2/sign.c new file mode 100644 index 00000000..050a5371 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/sign.c @@ -0,0 +1,435 @@ +#include "align.h" 
+#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include +#include + +static inline void polyvec_matrix_expand_row(polyvecl **row, polyvecl buf[2], const uint8_t rho[SEEDBYTES], unsigned int i) { + switch (i) { + case 0: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row0(buf, buf + 1, rho); + *row = buf; + break; + case 1: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row1(buf + 1, buf, rho); + *row = buf + 1; + break; + case 2: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row2(buf, buf + 1, rho); + *row = buf; + break; + case 3: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row3(buf + 1, buf, rho); + *row = buf + 1; + break; + case 4: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row4(buf, buf + 1, rho); + *row = buf; + break; + case 5: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row5(buf + 1, buf, rho); + *row = buf + 1; + break; + case 6: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row6(buf, buf + 1, rho); + *row = buf; + break; + case 7: + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand_row7(buf + 1, buf, rho); + *row = buf + 1; + break; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM5_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + const uint8_t *rho, *rhoprime, *key; + polyvecl rowbuf[2]; + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(&s1.vec[0], &s1.vec[1], &s1.vec[2], &s1.vec[3], rhoprime, 0, 1, 2, 3); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(&s1.vec[4], &s1.vec[5], &s1.vec[6], &s2.vec[0], rhoprime, 4, 5, 6, 7); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(&s2.vec[1], &s2.vec[2], &s2.vec[3], &s2.vec[4], rhoprime, 8, 9, 10, 11); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_eta_4x(&s2.vec[5], &s2.vec[6], &s2.vec[7], &t0, rhoprime, 12, 13, 14, 15); + + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM5_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); + } + + /* Transform s1 */ + PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt(&s1); + + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, rho, i); + + /* Compute inner-product */ + 
PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM5_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM5_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM5_AVX2_poly_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM5_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM5_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES); + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_crypto_sign_signature +* +* Description: Computes signature. +* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM5_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + 
shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM5_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM5_AVX2_polyveck_ntt(&t0); + + +rej: + /* Sample intermediate vector y */ + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_4x(&z.vec[0], &z.vec[1], &z.vec[2], &z.vec[3], + rhoprime, nonce, nonce + 1, nonce + 2, nonce + 3); + PQCLEAN_DILITHIUM5_AVX2_poly_uniform_gamma1_4x(&z.vec[4], &z.vec[5], &z.vec[6], &tmp, + rhoprime, nonce + 4, nonce + 5, nonce + 6, 0); + nonce += 7; + + /* Matrix-vector product */ + tmpv.y = z; + PQCLEAN_DILITHIUM5_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM5_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + PQCLEAN_DILITHIUM5_AVX2_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM5_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM5_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); + PQCLEAN_DILITHIUM5_AVX2_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM5_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&c); + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&z.vec[i]); + if (PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(&z.vec[i], GAMMA1 
- BETA)) { + goto rej; + } + } + + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&tmp); + if (PQCLEAN_DILITHIUM5_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM5_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM5_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; + } + + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); + } + + *siglen = PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_crypto_sign +* +* Description: Compute signed message. 
+* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM5_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_crypto_sign_verify +* +* Description: Verifies signature. +* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { + unsigned int i, j, pos = 0; + /* PQCLEAN_DILITHIUM5_AVX2_polyw1_pack writes additional 14 bytes */ + ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; + uint8_t mu[CRHBYTES]; + const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + polyvecl rowbuf[2]; + polyvecl *row = rowbuf; + polyvecl z; + poly c, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk,
PQCLEAN_DILITHIUM5_AVX2_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Expand PQCLEAN_DILITHIUM5_AVX2_challenge */ + PQCLEAN_DILITHIUM5_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&c); + + /* Unpack z; shortness follows from unpacking */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&z.vec[i]); + } + + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + polyvec_matrix_expand_row(&row, rowbuf, pk, i); + + /* Compute i-th row of Az - c2^Dt1 */ + PQCLEAN_DILITHIUM5_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); + + PQCLEAN_DILITHIUM5_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); + PQCLEAN_DILITHIUM5_AVX2_poly_shiftl(&h); + PQCLEAN_DILITHIUM5_AVX2_poly_ntt(&h); + PQCLEAN_DILITHIUM5_AVX2_poly_pointwise_montgomery(&h, &c, &h); + + PQCLEAN_DILITHIUM5_AVX2_poly_sub(&w1, &w1, &h); + PQCLEAN_DILITHIUM5_AVX2_poly_reduce(&w1); + PQCLEAN_DILITHIUM5_AVX2_poly_invntt_tomont(&w1); + + /* Get hint polynomial and reconstruct w1 */ + memset(h.vec, 0, sizeof(poly)); + if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { + return -1; + } + + for (j = pos; j < hint[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > pos && hint[j] <= hint[j - 1]) { + return -1; + } + h.coeffs[hint[j]] = 1; + } + pos = hint[OMEGA + i]; + + PQCLEAN_DILITHIUM5_AVX2_poly_caddq(&w1); + PQCLEAN_DILITHIUM5_AVX2_poly_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM5_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); + } + + /* Extra indices are zero for strong unforgeability */ + for (j = pos; j < OMEGA; ++j) { + if (hint[j]) { + return -1; + } + } + + /* Call random oracle and verify 
PQCLEAN_DILITHIUM5_AVX2_challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (buf.coeffs[i] != sig[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_AVX2_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise (then *mlen is (size_t) -1 and the first smlen bytes of m are zeroed) +**************************************************/ +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM5_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM5_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification failed: report an invalid length and wipe the output buffer */ + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium5/avx2/sign.h b/crypto_sign/dilithium5/avx2/sign.h new file mode 100644 index 00000000..e1c0ecf4 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_SIGN_H +#define
PQCLEAN_DILITHIUM5_AVX2_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM5_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM5_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5/avx2/symmetric-shake.c b/crypto_sign/dilithium5/avx2/symmetric-shake.c new file mode 100644 index 00000000..aee9daa5 --- /dev/null +++ b/crypto_sign/dilithium5/avx2/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM5_AVX2_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM5_AVX2_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium5/avx2/symmetric.h b/crypto_sign/dilithium5/avx2/symmetric.h new file mode 100644 index 00000000..3d5e8a50 --- 
/dev/null +++ b/crypto_sign/dilithium5/avx2/symmetric.h @@ -0,0 +1,36 @@ +#ifndef PQCLEAN_DILITHIUM5_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5_AVX2_SYMMETRIC_H +#include "fips202.h" +#include "params.h" +#include + + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM5_AVX2_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM5_AVX2_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5_AVX2_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5_AVX2_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium5/clean/LICENSE b/crypto_sign/dilithium5/clean/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium5/clean/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium5/clean/Makefile b/crypto_sign/dilithium5/clean/Makefile new file mode 100644 index 00000000..5e731017 --- /dev/null +++ b/crypto_sign/dilithium5/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium5_clean.a +HEADERS=api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-shake.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium5/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium5/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..017ca211 --- /dev/null +++ b/crypto_sign/dilithium5/clean/Makefile.Microsoft_nmake @@ -0,0 +1,23 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libdilithium5_clean.lib +OBJECTS=ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-shake.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_sign/dilithium5/clean/api.h b/crypto_sign/dilithium5/clean/api.h new file mode 100644 index 00000000..8a8d7901 --- /dev/null +++ b/crypto_sign/dilithium5/clean/api.h @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_API_H +#define PQCLEAN_DILITHIUM5_CLEAN_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES 2592 +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES 4880 +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES 4595 +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_ALGNAME "Dilithium5" + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5/clean/ntt.c b/crypto_sign/dilithium5/clean/ntt.c new file mode 100644 index 00000000..4f654191 --- /dev/null +++ b/crypto_sign/dilithium5/clean/ntt.c @@ -0,0 +1,98 @@ +#include "ntt.h" +#include "params.h" +#include "reduce.h" +#include + +static const int32_t zetas[N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, + 2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, + -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 
3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, + -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, + 3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, + -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, + -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, + -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, + 1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, + 2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, + 266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, + 900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, + -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, + 342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, + -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, + -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, + -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, + -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, + -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, + -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, + -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, + -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, + -554416, 3919660, -48306, -1362209, 
3937738, 1400424, -846154, 1976782 +}; + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_ntt +* +* Description: Forward NTT, in-place. No modular reduction is performed after +* additions or subtractions. Output vector is in bitreversed order. +* +* Arguments: - int32_t a[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_ntt(int32_t a[N]) { + unsigned int len, start, j, k; + int32_t zeta, t; + + k = 0; + for (len = 128; len > 0; len >>= 1) { + for (start = 0; start < N; start = j + len) { + zeta = zetas[++k]; + for (j = start; j < start + len; ++j) { + t = PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + a[j + len] = a[j] - t; + a[j] = a[j] + t; + } + } + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_invntt_tomont +* +* Description: Inverse NTT and multiplication by Montgomery factor 2^32. +* In-place. No modular reductions after additions or +* subtractions; input coefficients need to be smaller than +* Q in absolute value. Output coefficients are smaller than Q in +* absolute value.
+* +* Arguments: - int32_t a[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_invntt_tomont(int32_t a[N]) { + unsigned int start, len, j, k; + int32_t t, zeta; + const int32_t f = 41978; // mont^2/256 + + k = 256; + for (len = 1; len < N; len <<= 1) { + for (start = 0; start < N; start = j + len) { + zeta = -zetas[--k]; + for (j = start; j < start + len; ++j) { + t = a[j]; + a[j] = t + a[j + len]; + a[j + len] = t - a[j + len]; + a[j + len] = PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + } + } + } + + for (j = 0; j < N; ++j) { + a[j] = PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce((int64_t)f * a[j]); + } +} diff --git a/crypto_sign/dilithium5/clean/ntt.h b/crypto_sign/dilithium5/clean/ntt.h new file mode 100644 index 00000000..747b315c --- /dev/null +++ b/crypto_sign/dilithium5/clean/ntt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM5_CLEAN_NTT_H +#include "params.h" +#include + +void PQCLEAN_DILITHIUM5_CLEAN_ntt(int32_t a[N]); + +void PQCLEAN_DILITHIUM5_CLEAN_invntt_tomont(int32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium5/clean/packing.c b/crypto_sign/dilithium5/clean/packing.c new file mode 100644 index 00000000..34542a2b --- /dev/null +++ b/crypto_sign/dilithium5/clean/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1).
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_pack_sk +* +* Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
+* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - uint8_t rho[]: output byte array for rho +* - uint8_t tr[]: output byte array for tr +* - uint8_t key[]: output byte array for key +* - polyveck *t0: pointer to output vector t0 +* - polyvecl *s1: pointer to output vector s1 +* - polyveck *s2: pointer to output vector s2 +* - const uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; /* sk is advanced past each field, so every loop below indexes from 0 */ + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; /* key is read second, ahead of tr, unlike the parameter order */ + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { /* s1: L polynomials */ + PQCLEAN_DILITHIUM5_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { /* s2: K polynomials */ + PQCLEAN_DILITHIUM5_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { /* t0 is the last field */ + PQCLEAN_DILITHIUM5_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to challenge hash of length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h: OMEGA index bytes followed by K running counts; clear all of them first */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; /* number of index bytes written so far, across all polynomials */ + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; /* NOTE(review): not bounds-checked; assumes h has at most OMEGA nonzero coefficients in total - confirm callers guarantee this */ + } + } + + sig[OMEGA + i] = (uint8_t) k; /* cumulative count: end offset of polynomial i's indices */ + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h: sig[0..OMEGA-1] are coefficient indices, sig[OMEGA+i] is the running end offset for polynomial i */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { /* offsets must be non-decreasing and stay inside the index area */ + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; /* sig[j] is a byte, so it is always below N = 256 */ + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium5/clean/packing.h b/crypto_sign/dilithium5/clean/packing.h new file mode 100644 index 00000000..f1fa637a --- /dev/null +++ b/crypto_sign/dilithium5/clean/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_PACKING_H +#define PQCLEAN_DILITHIUM5_CLEAN_PACKING_H +#include "params.h" +#include "polyvec.h" +#include <stdint.h> + +void PQCLEAN_DILITHIUM5_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM5_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM5_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void PQCLEAN_DILITHIUM5_CLEAN_unpack_pk(uint8_t 
rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM5_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM5_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium5/clean/params.h b/crypto_sign/dilithium5/clean/params.h new file mode 100644 index 00000000..f7604a1d --- /dev/null +++ b/crypto_sign/dilithium5/clean/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM5_CLEAN_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 75 +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_ALGNAME "Dilithium5" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium5/clean/poly.c b/crypto_sign/dilithium5/clean/poly.c new file mode 100644 index 00000000..ff12495b --- /dev/null +++ b/crypto_sign/dilithium5/clean/poly.c @@ -0,0 +1,842 @@ +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" 
+#include "rounding.h" +#include "symmetric.h" +#include <stdint.h> + +#define DBENCH_START() +#define DBENCH_STOP(t) + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_reduce(poly *a) { + unsigned int i; + DBENCH_START(); /* DBENCH_START/DBENCH_STOP are defined empty above, so these lines expand to nothing */ + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_reduce32(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_caddq(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_freeze(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_freeze(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_add +* +* Description: Add polynomials. No modular reduction is performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtraced from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_shiftl(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] <<= D; + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_CLEAN_ntt(a->coeffs); /* thin wrapper: transforms the coefficient array in place */ + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5_CLEAN_invntt_tomont(a->coeffs); /* thin wrapper: transforms the coefficient array in place */ + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); /* widen first so the product is formed in 64 bits */ + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); /* the helper's return value goes to a1; it receives a pointer into a0 for the other part */ + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c1, c0 such that c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); /* the helper's return value goes to a1; it receives a pointer into a0 for the other part */ + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM5_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; /* running sum of the hints; presumably each is 0 or 1, so s counts the set ones */ + } + + DBENCH_STOP(*tround); + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM5_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM5_CLEAN_reduce32(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { /* a bound above (Q-1)/8 is always reported as a violation */ + return 1; + } + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. 
 */ + for (i = 0; i < N; ++i) { + /* Absolute value, computed without branching on the sign */ + t = a->coeffs[i] >> 31; /* 0 for non-negative, all ones for negative; relies on arithmetic right shift, which C leaves implementation-defined */ + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); /* subtract 2*x only when x is negative, giving -x */ + + if (t >= B) { + DBENCH_STOP(*tsample); + return 1; + } + } + + DBENCH_STOP(*tsample); + return 0; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { /* each candidate consumes 3 bytes */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits */ + + if (t < Q) { /* candidates >= Q are dropped */ + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; /* +2: room for up to two leftover bytes moved to the front before a refill */ + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = rej_uniform(a->coeffs, N, buf, buflen); + + while (ctr < N) { /* not enough accepted samples yet: squeeze one more block at a time */ + off = buflen % 3; /* bytes rej_uniform could not consume, since it reads in groups of 3 */ + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; + } + + stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); + } + stream128_release(&state); +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; /* low nibble */ + t1 = buf[pos++] >> 4; /* high nibble */ + + if (t0 < 15) { /* nibble 15 is rejected so the remaining 15 values split evenly over 5 outputs */ + t0 = t0 - (205 * t0 >> 10) * 5; /* t0 mod 5: (205 * t0) >> 10 equals t0 / 5 for t0 < 15 */ + a[ctr++] = 2 - (int32_t)t0; /* cast first: the unsigned difference would wrap for t0 > 2 */ + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - (int32_t)t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = rej_eta(a->coeffs, N, buf, buflen); + + while (ctr < N) { /* rej_eta consumes whole bytes, so no leftover bytes need carrying over */ + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } + stream128_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of 
SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce) { + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; /* whole blocks, at least POLYZ_PACKEDBYTES bytes */ + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + stream256_release(&state); + PQCLEAN_DILITHIUM5_CLEAN_polyz_unpack(a, buf); /* no rejection step: the stream bytes are decoded directly as a packed z polynomial */ +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, sizeof buf, &state); + + signs = 0; /* the first 8 stream bytes, little-endian, give one sign bit per nonzero coefficient */ + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)buf[i] << 8 * i; + } + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + for (i = N - TAU; i < N; ++i) { /* TAU iterations, each placing exactly one +-1 */ + do { + if (pos >= SHAKE256_RATE) { /* buffer exhausted: squeeze another block */ + shake256_inc_squeeze(buf, sizeof buf, &state); + pos = 0; + } + + b = buf[pos++]; + } while (b > i); /* reject bytes above i so that b lies in [0, i] */ + + c->coeffs[i] = c->coeffs[b]; /* move whatever sits at b up to i, then overwrite b */ + c->coeffs[b] = 1 - 2 * (int32_t)(signs & 1); /* cast first: in uint64_t the value -1 would wrap to 2^64-1 before narrowing */ + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients at 3 bits each fill 3 output bytes */ + t[0] = (uint8_t) (ETA - a->coeffs[8 * i + 0]); /* map [-ETA, ETA] onto [0, 2*ETA] */ + t[1] = (uint8_t) (ETA - a->coeffs[8 * i + 1]); + t[2] = (uint8_t) (ETA - a->coeffs[8 * i + 2]); + t[3] = (uint8_t) (ETA - a->coeffs[8 * i + 3]); + t[4] = (uint8_t) (ETA - a->coeffs[8 * i + 4]); + t[5] = (uint8_t) (ETA - a->coeffs[8 * i + 5]); + t[6] = (uint8_t) (ETA - a->coeffs[8 * i + 6]); + t[7] = (uint8_t) (ETA - a->coeffs[8 * i + 7]); + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); /* t[2] straddles bytes 0 and 1 */ + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); /* t[5] straddles bytes 1 and 2 */ + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 3 input bytes yield 8 three-bit fields */ + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; /* field straddles bytes 0 and 1 */ + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; /* field straddles bytes 1 and 2 */ + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; /* undo the ETA - x mapping; fields above 2*ETA are not rejected and decode below -ETA */ + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 4 coefficients at 10 bits each fill 5 output bytes */ + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); /* the uint8_t cast drops the bits that belong to later bytes */ + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 5 input bytes yield 4 ten-bit coefficients; 0x3FF masks each to 10 bits */ + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients at 13 bits each fill 13 output bytes */ + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; /* map ]-2^{D-1}, 2^{D-1}] onto [0, 2^D - 1] */ + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); /* bytes shared by two coefficients are built with = then |= */ + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 13 input bytes yield 8 thirteen-bit fields; 0x1FFF masks each to 13 bits */ + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; /* undo the 2^{D-1} - x mapping applied when packing */ + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) - 
r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyz_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[4]; /* only t[0] and t[1] are used below */ + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { /* 2 coefficients at 20 bits each fill 5 output bytes */ + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; /* map [-(GAMMA1 - 1), GAMMA1] onto [0, 2*GAMMA1 - 1] */ + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = (uint8_t) t[0]; + r[5 * i + 1] = (uint8_t) (t[0] >> 8); + r[5 * i + 2] = (uint8_t) (t[0] >> 16); + r[5 * i + 2] |= (uint8_t) (t[1] << 4); /* byte 2 holds the top 4 bits of t[0] and the low 4 bits of t[1] */ + r[5 * i + 3] = (uint8_t) (t[1] >> 4); + r[5 * i + 4] = (uint8_t) (t[1] >> 12); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+*
+* Arguments:   - poly *r: pointer to output polynomial
+*              - const uint8_t *a: byte array with bit-packed polynomial
+*                                  (5 bytes are read per 2 coefficients)
+**************************************************/
+void PQCLEAN_DILITHIUM5_CLEAN_polyz_unpack(poly *r, const uint8_t *a) {
+    unsigned int i;
+    DBENCH_START();
+
+    for (i = 0; i < N / 2; ++i) {
+        /* First 20-bit field: bytes 0, 1 and the low nibble of byte 2. */
+        r->coeffs[2 * i + 0] = a[5 * i + 0];
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
+        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16;
+        r->coeffs[2 * i + 0] &= 0xFFFFF;
+
+        /* Second 20-bit field: high nibble of byte 2, then bytes 3 and 4. */
+        r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
+        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;
+        /* Mask the field that was just assembled. This mask used to be
+         * applied to coefficient 2*i+0 a second time; as at most 20 bits
+         * can be set here, the unpacked values are unchanged by the fix. */
+        r->coeffs[2 * i + 1] &= 0xFFFFF;
+
+        /* Undo the packing offset: the stored field f stands for GAMMA1 - f. */
+        r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0];
+        r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1];
+    }
+
+    DBENCH_STOP(*tpack);
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_polyw1_pack
+*
+* Description: Bit-pack polynomial w1 with coefficients in [0,15].
+*              Input coefficients are assumed to be standard representatives.
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium5/clean/poly.h b/crypto_sign/dilithium5/clean/poly.h new file mode 100644 index 00000000..f768f63e --- /dev/null +++ b/crypto_sign/dilithium5/clean/poly.h @@ -0,0 +1,53 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM5_CLEAN_POLY_H +#include "params.h" +#include + +typedef struct { + int32_t coeffs[N]; +} poly; + +void PQCLEAN_DILITHIUM5_CLEAN_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM5_CLEAN_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5_CLEAN_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM5_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM5_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM5_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM5_CLEAN_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_eta(poly *a, + const 
uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM5_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5_CLEAN_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5_CLEAN_polyw1_pack(uint8_t *r, const poly *a); + +#endif diff --git a/crypto_sign/dilithium5/clean/polyvec.c b/crypto_sign/dilithium5/clean/polyvec.c new file mode 100644 index 00000000..e996d1af --- /dev/null +++ b/crypto_sign/dilithium5/clean/polyvec.c @@ -0,0 +1,448 @@ +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include + +/************************************************* +* Name: expand_mat +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for (i = 0; i < K; ++i) { + for (j = 0; j < L; ++j) { + PQCLEAN_DILITHIUM5_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); + } + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. 
+*
+* Arguments:   - polyvecl *v: pointer to input/output vector
+**************************************************/
+void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_freeze(polyvecl *v) {
+    poly *p;
+
+    for (p = v->vec; p != v->vec + L; ++p) {
+        PQCLEAN_DILITHIUM5_CLEAN_poly_freeze(p);
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_polyvecl_add
+*
+* Description: Component-wise sum of two vectors of polynomials of length L,
+*              computed with poly_add. No modular reduction is performed.
+*
+* Arguments:   - polyvecl *w: pointer to output vector
+*              - const polyvecl *u: pointer to first summand
+*              - const polyvecl *v: pointer to second summand
+**************************************************/
+void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+    unsigned int k = 0;
+
+    while (k < L) {
+        PQCLEAN_DILITHIUM5_CLEAN_poly_add(w->vec + k, u->vec + k, v->vec + k);
+        ++k;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt
+*
+* Description: Forward NTT of all polynomials in vector of length L. Output
+*              coefficients can be up to 16*Q larger than input coefficients.
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + unsigned int i; + poly t; + + PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); + for (i = 1; i < L; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); + PQCLEAN_DILITHIUM5_CLEAN_poly_add(w, w, &t); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM5_CLEAN_polyvecl_reduce(). 
+* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM5_CLEAN_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(). +* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM5_CLEAN_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM5_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int k = 0;
+
+    while (k < K) {
+        PQCLEAN_DILITHIUM5_CLEAN_poly_power2round(v1->vec + k, v0->vec + k, v->vec + k);
+        ++k;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_polyveck_decompose
+*
+* Description: For all coefficients a of polynomials in vector of length K,
+*              compute high and low bits a0, a1 such that
+*              a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
+*              except if a1 = (Q-1)/ALPHA, where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod Q - Q < 0.
+*              Assumes coefficients to be standard representatives.
+*
+* Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+*                              coefficients a1
+*              - polyveck *v0: pointer to output vector of polynomials with
+*                              coefficients a0
+*              - const polyveck *v: pointer to input vector
+**************************************************/
+void PQCLEAN_DILITHIUM5_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+    unsigned int k = 0;
+
+    while (k < K) {
+        PQCLEAN_DILITHIUM5_CLEAN_poly_decompose(v1->vec + k, v0->vec + k, v->vec + k);
+        ++k;
+    }
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_polyveck_make_hint
+*
+* Description: Compute hint vector.
+*
+* Arguments:   - polyveck *h: pointer to output vector
+*              - const polyveck *v0: pointer to low part of input vector
+*              - const polyveck *v1: pointer to high part of input vector
+*
+* Returns number of 1 bits.
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM5_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM5_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); + } + + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium5/clean/polyvec.h b/crypto_sign/dilithium5/clean/polyvec.h new file mode 100644 index 00000000..9d564f67 --- /dev/null +++ b/crypto_sign/dilithium5/clean/polyvec.h @@ -0,0 +1,68 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM5_CLEAN_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_reduce(polyvecl *v); + +void 
PQCLEAN_DILITHIUM5_CLEAN_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + + +int PQCLEAN_DILITHIUM5_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM5_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM5_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1); +void PQCLEAN_DILITHIUM5_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void 
PQCLEAN_DILITHIUM5_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium5/clean/reduce.c b/crypto_sign/dilithium5/clean/reduce.c new file mode 100644 index 00000000..ded7c28b --- /dev/null +++ b/crypto_sign/dilithium5/clean/reduce.c @@ -0,0 +1,69 @@ +#include "params.h" +#include "reduce.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; + + t = (int32_t)((uint64_t)a * (uint64_t)QINV); + t = (a - (int64_t)t * Q) >> 32; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5_CLEAN_reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; + t = a - t * Q; + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_caddq +* +* Description: Add Q if input coefficient is negative. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. 
+**************************************************/ +int32_t PQCLEAN_DILITHIUM5_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5_CLEAN_freeze(int32_t a) { + a = PQCLEAN_DILITHIUM5_CLEAN_reduce32(a); + a = PQCLEAN_DILITHIUM5_CLEAN_caddq(a); + return a; +} diff --git a/crypto_sign/dilithium5/clean/reduce.h b/crypto_sign/dilithium5/clean/reduce.h new file mode 100644 index 00000000..4448149a --- /dev/null +++ b/crypto_sign/dilithium5/clean/reduce.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM5_CLEAN_REDUCE_H +#include "params.h" +#include + +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +int32_t PQCLEAN_DILITHIUM5_CLEAN_montgomery_reduce(int64_t a); + +int32_t PQCLEAN_DILITHIUM5_CLEAN_reduce32(int32_t a); + +int32_t PQCLEAN_DILITHIUM5_CLEAN_caddq(int32_t a); + +int32_t PQCLEAN_DILITHIUM5_CLEAN_freeze(int32_t a); + +#endif diff --git a/crypto_sign/dilithium5/clean/rounding.c b/crypto_sign/dilithium5/clean/rounding.c new file mode 100644 index 00000000..7a8e3b01 --- /dev/null +++ b/crypto_sign/dilithium5/clean/rounding.c @@ -0,0 +1,92 @@ +#include "params.h" +#include "rounding.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. 
+**************************************************/
+int32_t PQCLEAN_DILITHIUM5_CLEAN_power2round(int32_t *a0, int32_t a) {
+    int32_t a1;
+
+    a1 = (a + (1 << (D - 1)) - 1) >> D; /* add 2^(D-1) - 1 before shifting so that a0 lands in (-2^(D-1), 2^(D-1)] */
+    *a0 = a - (a1 << D); /* remainder relative to a1 * 2^D */
+    return a1;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_decompose
+*
+* Description: For finite field element a, compute high and low bits a0, a1 such
+*              that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except
+*              if a1 = (Q-1)/ALPHA where we set a1 = 0 and
+*              -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard
+*              representative.
+*              The code below uses ALPHA = 2 * GAMMA2 and the fixed constants
+*              127, 1025, 2^21, 2^22 and 15.
+*              NOTE(review): these constants presumably match
+*              GAMMA2 = (Q-1)/32 only -- confirm against params.h before
+*              reusing this routine for another parameter set.
+*
+* Arguments:   - int32_t a: input element
+*              - int32_t *a0: pointer to output element a0
+*
+* Returns a1.
+**************************************************/
+int32_t PQCLEAN_DILITHIUM5_CLEAN_decompose(int32_t *a0, int32_t a) {
+    int32_t a1;
+
+    a1 = (a + 127) >> 7; /* ceil(a / 2^7) for non-negative a */
+    a1 = (a1 * 1025 + (1 << 21)) >> 22; /* multiply by 1025 / 2^22 and round to nearest, instead of dividing */
+    a1 &= 15; /* keep 4 bits: a1 == 16 wraps around to 0 */
+
+    *a0 = a - a1 * 2 * GAMMA2;
+    *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; /* branch-free: subtract Q when *a0 > (Q-1)/2 (sign bit used as mask) */
+    return a1;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_make_hint
+*
+* Description: Compute hint bit indicating whether the low bits of the
+*              input element overflow into the high bits.
+*              The hint is set when a0 lies outside [-GAMMA2, GAMMA2], or
+*              when a0 == -GAMMA2 and a1 is non-zero.
+*
+* Arguments:   - int32_t a0: low bits of input element
+*              - int32_t a1: high bits of input element
+*
+* Returns 1 if overflow.
+**************************************************/
+unsigned int PQCLEAN_DILITHIUM5_CLEAN_make_hint(int32_t a0, int32_t a1) {
+    if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_use_hint
+*
+* Description: Correct high bits according to hint.
+*
+* Arguments:   - int32_t a: input element
+*              - unsigned int hint: hint bit
+*
+* Returns corrected high bits.
+**************************************************/ +int32_t PQCLEAN_DILITHIUM5_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM5_CLEAN_decompose(&a0, a); + if (hint == 0) { + return a1; + } + + if (a0 > 0) { + return (a1 + 1) & 15; + } + return (a1 - 1) & 15; +} diff --git a/crypto_sign/dilithium5/clean/rounding.h b/crypto_sign/dilithium5/clean/rounding.h new file mode 100644 index 00000000..820e9cd6 --- /dev/null +++ b/crypto_sign/dilithium5/clean/rounding.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM5_CLEAN_ROUNDING_H +#include "params.h" +#include + +int32_t PQCLEAN_DILITHIUM5_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM5_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM5_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM5_CLEAN_use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium5/clean/sign.c b/crypto_sign/dilithium5/clean/sign.c new file mode 100644 index 00000000..791821b4 --- /dev/null +++ b/crypto_sign/dilithium5/clean/sign.c @@ -0,0 +1,343 @@ +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_keypair +* +* Description: Generates public and private key. 
+*
+* Arguments:   - uint8_t *pk: pointer to output public key (allocated
+*                             array of PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES bytes)
+*              - uint8_t *sk: pointer to output private key (allocated
+*                             array of PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_SECRETKEYBYTES bytes)
+*
+* Returns 0 (success)
+**************************************************/
+int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) {
+    uint8_t seedbuf[3 * SEEDBYTES]; /* holds rho | rhoprime | key, SEEDBYTES each */
+    uint8_t tr[CRHBYTES];
+    const uint8_t *rho, *rhoprime, *key;
+    polyvecl mat[K];
+    polyvecl s1, s1hat;
+    polyveck s2, t1, t0;
+
+    /* Get randomness for rho, rhoprime and key: draw SEEDBYTES random bytes
+     * and expand them in place to 3 * SEEDBYTES with SHAKE256.
+     * NOTE(review): input and output of shake256() alias here -- this
+     * presumably relies on the input being fully absorbed before any
+     * output is written; confirm against fips202.c. */
+    randombytes(seedbuf, SEEDBYTES);
+    shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
+    rho = seedbuf;
+    rhoprime = seedbuf + SEEDBYTES;
+    key = seedbuf + 2 * SEEDBYTES;
+
+    /* Expand matrix */
+    PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_expand(mat, rho);
+
+    /* Sample short vectors s1 and s2; s1 uses nonces starting at 0 and s2
+     * continues at L, so the two nonce ranges do not overlap */
+    PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0);
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L);
+
+    /* Matrix-vector multiplication */
+    s1hat = s1; /* work on a copy: s1 itself is packed into the secret key below */
+    PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(&s1hat);
+    PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat);
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(&t1);
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(&t1);
+
+    /* Add error vector s2 */
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_add(&t1, &t1, &s2);
+
+    /* Extract t1 and write public key */
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq(&t1);
+    PQCLEAN_DILITHIUM5_CLEAN_polyveck_power2round(&t1, &t0, &t1); /* t1 is both the input and the high-part output */
+    PQCLEAN_DILITHIUM5_CLEAN_pack_pk(pk, rho, &t1);
+
+    /* Compute CRH(rho, t1) and write secret key; the hash is taken over the
+     * packed public key that was just written to pk */
+    crh(tr, pk, PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES);
+    PQCLEAN_DILITHIUM5_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2);
+
+    return 0;
+}
+
+/*************************************************
+* Name:        PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_signature
+*
+* Description: Computes signature.
+* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + unsigned int n; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + polyvecl mat[K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM5_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute mu = CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); /* rhoprime = CRH(key || mu): key and mu are adjacent in seedbuf */ + + /* Expand matrix and transform s1, s2 and t0 with the NTT */ + PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y (nonce is incremented on every attempt) */ + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); + + /* Matrix-vector multiplication */ + z = y; + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq(&w1); +
PQCLEAN_DILITHIUM5_CLEAN_polyveck_decompose(&w1, &w0, &w1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM5_CLEAN_poly_challenge(&cp, sig); + PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(&cp); + + /* Compute z, reject if it reveals secret */ + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_invntt_tomont(&z); + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_reduce(&z); + if (PQCLEAN_DILITHIUM5_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM5_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_sub(&w0, &w0, &h); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(&w0); + if (PQCLEAN_DILITHIUM5_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + PQCLEAN_DILITHIUM5_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(&h); + if (PQCLEAN_DILITHIUM5_CLEAN_polyveck_chknorm(&h, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM5_CLEAN_polyveck_add(&w0, &w0, &h); + n = PQCLEAN_DILITHIUM5_CLEAN_polyveck_make_hint(&h, &w0, &w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature; the first SEEDBYTES bytes of sig already hold the challenge seed squeezed above */ + PQCLEAN_DILITHIUM5_CLEAN_pack_sig(sig, sig, &z, &h); + *siglen = PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_crypto_sign +* +* Description: Compute signed
message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m (the message is copied last byte first) +* - size_t *smlen: pointer to output length of signed +* message (signature length plus mlen) +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { /* place m after the signature slot, walking from the last byte to the first */ + sm[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_verify +* +* Description: Verifies signature.
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature (must equal PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES) +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; + polyvecl mat[K], z; + polyveck t1, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM5_CLEAN_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM5_CLEAN_unpack_sig(c, &z, &h, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM5_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute mu = CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Matrix-vector multiplication; compute Az - c*2^d*t1 */ + PQCLEAN_DILITHIUM5_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_expand(mat, rho); + + PQCLEAN_DILITHIUM5_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM5_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + PQCLEAN_DILITHIUM5_CLEAN_poly_ntt(&cp); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_ntt(&t1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); + + PQCLEAN_DILITHIUM5_CLEAN_polyveck_sub(&w1, &w1, &t1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_reduce(&w1); +
PQCLEAN_DILITHIUM5_CLEAN_polyveck_invntt_tomont(&w1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM5_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM5_CLEAN_polyveck_pack_w1(buf, &w1); + + /* Call random oracle and verify challenge: recompute c2 and compare it with c from the signature */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message ((size_t) -1 on failure) +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM5_CLEAN_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification failed: report length (size_t) -1 and zero all smlen bytes of m */ + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return
-1; +} diff --git a/crypto_sign/dilithium5/clean/sign.h b/crypto_sign/dilithium5/clean/sign.h new file mode 100644 index 00000000..82c6855b --- /dev/null +++ b/crypto_sign/dilithium5/clean/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM5_CLEAN_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uint8_t */ + +void PQCLEAN_DILITHIUM5_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM5_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5/clean/symmetric-shake.c b/crypto_sign/dilithium5/clean/symmetric-shake.c new file mode 100644 index 00000000..23faa5f0 --- /dev/null +++ b/crypto_sign/dilithium5/clean/symmetric-shake.c @@ -0,0 +1,26 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include <stdint.h> /* uint8_t, uint16_t */ + +void PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + + shake128_inc_init(state); + shake128_inc_absorb(state, seed, SEEDBYTES); + shake128_inc_absorb(state, t, 2); + shake128_inc_finalize(state); +} + +void PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, const uint8_t seed[CRHBYTES], uint16_t nonce) { + uint8_t t[2]; + t[0] = (uint8_t) nonce; + t[1] = (uint8_t) (nonce >> 8); + +
shake256_inc_init(state); + shake256_inc_absorb(state, seed, CRHBYTES); + shake256_inc_absorb(state, t, 2); + shake256_inc_finalize(state); +} diff --git a/crypto_sign/dilithium5/clean/symmetric.h b/crypto_sign/dilithium5/clean/symmetric.h new file mode 100644 index 00000000..92ed263b --- /dev/null +++ b/crypto_sign/dilithium5/clean/symmetric.h @@ -0,0 +1,36 @@ +#ifndef PQCLEAN_DILITHIUM5_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5_CLEAN_SYMMETRIC_H +#include "fips202.h" +#include "params.h" +#include <stdint.h> /* uint8_t, uint16_t */ + + + +typedef shake128incctx stream128_state; +typedef shake256incctx stream256_state; + +void PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake128_stream_init(shake128incctx *state, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); + +void PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake256_stream_init(shake256incctx *state, + const uint8_t seed[CRHBYTES], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES SHAKE128_RATE +#define STREAM256_BLOCKBYTES SHAKE256_RATE + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake128_stream_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE128_RATE), STATE) +#define stream128_release(STATE) shake128_inc_ctx_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5_CLEAN_dilithium_shake256_stream_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake256_inc_squeeze(OUT, (OUTBLOCKS)*(SHAKE256_RATE), STATE) +#define stream256_release(STATE) shake256_inc_ctx_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium5aes/META.yml b/crypto_sign/dilithium5aes/META.yml new file mode 100644 index 00000000..e14b2cf7 --- /dev/null +++ b/crypto_sign/dilithium5aes/META.yml @@ -0,0 +1,31 @@ +name: Dilithium5-AES +type: signature +claimed-nist-level: 5 +length-public-key: 2592 +length-secret-key: 4880
+length-signature: 4595 +nistkat-sha256: 882d5050d6289875cbaa3bd920ec60ff3e2895257cbe8f76ed9d3735daa188c6 +testvectors-sha256: 8289af5b8aeb78bd6a642d1899364ce3ab9f3b2bd4c66da9a9031a9832e71545 +principal-submitters: + - Vadim Lyubashevsky +auxiliary-submitters: + - Léo Ducas + - Eike Kiltz + - Tancrède Lepoint + - Peter Schwabe + - Gregor Seiler + - Damien Stehlé +implementations: + - name: clean + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + - name: avx2 + version: https://github.com/pq-crystals/dilithium/commit/1e63a1e880401166f105ab44ec67464c9714a315 via https://github.com/jschanck/package-pqclean/tree/b158a891/dilithium + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - popcnt diff --git a/crypto_sign/dilithium5aes/avx2/LICENSE b/crypto_sign/dilithium5aes/avx2/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium5aes/avx2/Makefile b/crypto_sign/dilithium5aes/avx2/Makefile new file mode 100644 index 00000000..dc96feb1 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/Makefile @@ -0,0 +1,23 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium5aes_avx2.a +HEADERS=aes256ctr.h align.h api.h cdecl.h consts.h ntt.h packing.h params.h poly.h polyvec.h rejsample.h rounding.h sign.h symmetric.h shuffle.inc +OBJECTS=aes256ctr.o consts.o packing.o poly.o polyvec.o rejsample.o rounding.o sign.o invntt.o ntt.o pointwise.o shuffle.o +CFLAGS=-mavx2 -maes -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls -std=c99 \ + -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +%.o: %.S $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium5aes/avx2/aes256ctr.c b/crypto_sign/dilithium5aes/avx2/aes256ctr.c new file mode 100644 index 00000000..ab61383d --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/aes256ctr.c @@ -0,0 +1,142 @@ +#include "aes256ctr.h" +#include <immintrin.h> /* __m128i and the _mm_* intrinsics */ +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uint8_t, uint64_t */ +/* Based heavily on public-domain code by Romain Dolbeau + * Different handling of nonce+counter than original version using + * separated 64-bit nonce and internal 64-bit counter, starting from zero + * Public Domain */ + + +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { + __m128i f, f0, f1, f2, f3; + const __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + + /* Load current counter value */ + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), idx); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), idx); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), idx); + f3 =
_mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), idx); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + f = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, f); + f1 = _mm_xor_si128(f1, f); + f2 = _mm_xor_si128(f2, f); + f3 = _mm_xor_si128(f3, f); + + for (int i = 1; i < 14; i++) { + f = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, f); + f1 = _mm_aesenc_si128(f1, f); + f2 = _mm_aesenc_si128(f2, f); + f3 = _mm_aesenc_si128(f3, f); + } + + f = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, f); + f1 = _mm_aesenclast_si128(f1, f); + f2 = _mm_aesenclast_si128(f2, f); + f3 = _mm_aesenclast_si128(f3, f); + + /* Write results */ + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); +} + +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; + + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n = _mm_loadl_epi64((__m128i *)&nonce); + + state->rkeys[idx++] = key0; + temp0 = key0; + temp2 = key1; + temp4 = _mm_setzero_si128(); + +#define BLOCK1(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp2, IMM); \ + state->rkeys[idx++] = temp2; \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x10); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp0, 0x8c); \ + temp0 = _mm_xor_si128(temp0, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xff); \ + temp0 = _mm_xor_si128(temp0, temp1) + +#define BLOCK2(IMM) \ + temp1 = _mm_aeskeygenassist_si128(temp0, IMM); \ + state->rkeys[idx++] = temp0; \ + temp4 = 
(__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x10); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp4 = (__m128i)_mm_shuffle_ps((__m128)temp4, (__m128)temp2, 0x8c); \ + temp2 = _mm_xor_si128(temp2, temp4); \ + temp1 = (__m128i)_mm_shuffle_ps((__m128)temp1, (__m128)temp1, 0xaa); \ + temp2 = _mm_xor_si128(temp2, temp1) + + BLOCK1(0x01); + BLOCK2(0x01); + + BLOCK1(0x02); + BLOCK2(0x02); + + BLOCK1(0x04); + BLOCK2(0x04); + + BLOCK1(0x08); + BLOCK2(0x08); + + BLOCK1(0x10); + BLOCK2(0x10); + + BLOCK1(0x20); + BLOCK2(0x20); + + BLOCK1(0x40); + state->rkeys[idx++] = temp0; +} + +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state) { + size_t i; + for (i = 0; i < nblocks; i++) { + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i; + uint8_t buf[64]; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&state, seed, nonce); + + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; + out += 64; + } + + if (outlen) { + aesni_encrypt4(buf, &state.n, state.rkeys); + for (i = 0; i < outlen; i++) { + out[i] = buf[i]; + } + } +} diff --git a/crypto_sign/dilithium5aes/avx2/aes256ctr.h b/crypto_sign/dilithium5aes/avx2/aes256ctr.h new file mode 100644 index 00000000..9245f9ef --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/aes256ctr.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_AES256CTR_H +#define PQCLEAN_DILITHIUM5AES_AVX2_AES256CTR_H + +#include <immintrin.h> /* __m128i */ +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uint8_t, uint64_t */ + + +#define AES256CTR_BLOCKBYTES 64 + +typedef struct { + __m128i rkeys[16]; + __m128i n; +} aes256ctr_ctx; + +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint64_t nonce); + +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx *state);
+ +void PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t seed[32], + uint64_t nonce); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/align.h b/crypto_sign/dilithium5aes/avx2/align.h new file mode 100644 index 00000000..27bd9ce9 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/align.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_ALIGN_H +#define PQCLEAN_DILITHIUM5AES_AVX2_ALIGN_H + +#include <immintrin.h> /* __m256i */ +#include <stdint.h> /* uint8_t, int32_t */ + +#define ALIGNED_UINT8(N) \ + union { \ + uint8_t coeffs[N]; \ + __m256i vec[((N)+31)/32]; \ + } + +#define ALIGNED_INT32(N) \ + union { \ + int32_t coeffs[N]; \ + __m256i vec[((N)+7)/8]; \ + } + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/api.h b/crypto_sign/dilithium5aes/avx2/api.h new file mode 100644 index 00000000..a6399853 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/api.h @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_API_H +#define PQCLEAN_DILITHIUM5AES_AVX2_API_H + +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uint8_t */ + +#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES 2592 +#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES 4880 +#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES 4595 +#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_ALGNAME "Dilithium5-AES" + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/cdecl.h b/crypto_sign/dilithium5aes/avx2/cdecl.h new file mode 100644 index
00000000..f23c2742 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/cdecl.h @@ -0,0 +1,24 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_CDECL_H +#define PQCLEAN_DILITHIUM5AES_AVX2_CDECL_H + + + +#define _8XQ 0 +#define _8XQINV 8 +#define _8XDIV_QINV 16 +#define _8XDIV 24 +#define _ZETAS_QINV 32 +#define _ZETAS 328 + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (consts.c). + * + * This define helps us get around this + */ + +#define _cdecl(s) _##s +#define cdecl(s) s + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/consts.c b/crypto_sign/dilithium5aes/avx2/consts.c new file mode 100644 index 00000000..6d3c7afc --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/consts.c @@ -0,0 +1,101 @@ +#include "consts.h" +#include "params.h" +#include <stdint.h> /* int32_t */ + +#define QINV 58728449 // q^(-1) mod 2^32 +#define MONT (-4186625) // 2^32 mod q +#define DIV 41978 // mont^2/256 +#define DIV_QINV (-8395782) + +const qdata_t PQCLEAN_DILITHIUM5AES_AVX2_qdata = {{ +//#define _8XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, + +//#define _8XQINV 8 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +//#define _8XDIV_QINV 16 + DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, DIV_QINV, + +//#define _8XDIV 24 + DIV, DIV, DIV, DIV, DIV, DIV, DIV, DIV, + +//#define _ZETAS_QINV 32 + -151046689, 1830765815, -1929875198, -1927777021, 1640767044, 1477910808, 1612161320, 1640734244, + 308362795, 308362795, 308362795, 308362795, -1815525077, -1815525077, -1815525077, -1815525077, + -1374673747, -1374673747, -1374673747, -1374673747, -1091570561, -1091570561, -1091570561, -1091570561, + -1929495947, -1929495947, -1929495947, -1929495947, 515185417, 515185417, 515185417, 515185417, + -285697463, -285697463, -285697463, -285697463, 625853735, 625853735, 625853735, 625853735, + 1727305304, 1727305304, 2082316400, 2082316400,
-1364982364, -1364982364, 858240904, 858240904, + 1806278032, 1806278032, 222489248, 222489248, -346752664, -346752664, 684667771, 684667771, + 1654287830, 1654287830, -878576921, -878576921, -1257667337, -1257667337, -748618600, -748618600, + 329347125, 329347125, 1837364258, 1837364258, -1443016191, -1443016191, -1170414139, -1170414139, + -1846138265, -1631226336, -1404529459, 1838055109, 1594295555, -1076973524, -1898723372, -594436433, + -202001019, -475984260, -561427818, 1797021249, -1061813248, 2059733581, -1661512036, -1104976547, + -1750224323, -901666090, 418987550, 1831915353, -1925356481, 992097815, 879957084, 2024403852, + 1484874664, -1636082790, -285388938, -1983539117, -1495136972, -950076368, -1714807468, -952438995, + -1574918427, 1350681039, -2143979939, 1599739335, -1285853323, -993005454, -1440787840, 568627424, + -783134478, -588790216, 289871779, -1262003603, 2135294594, -1018755525, -889861155, 1665705315, + 1321868265, 1225434135, -1784632064, 666258756, 675310538, -1555941048, -1999506068, -1499481951, + -695180180, -1375177022, 1777179795, 334803717, -178766299, -518252220, 1957047970, 1146323031, + -654783359, -1974159335, 1651689966, 140455867, -1039411342, 1955560694, 1529189038, -2131021878, + -247357819, 1518161567, -86965173, 1708872713, 1787797779, 1638590967, -120646188, -1669960606, + -916321552, 1155548552, 2143745726, 1210558298, -1261461890, -318346816, 628664287, -1729304568, + 1422575624, 1424130038, -1185330464, 235321234, 168022240, 1206536194, 985155484, -894060583, + -898413, -1363460238, -605900043, 2027833504, 14253662, 1014493059, 863641633, 1819892093, + 2124962073, -1223601433, -1920467227, -1637785316, -1536588520, 694382729, 235104446, -1045062172, + 831969619, -300448763, 756955444, -260312805, 1554794072, 1339088280, -2040058690, -853476187, + -2047270596, -1723816713, -1591599803, -440824168, 1119856484, 1544891539, 155290192, -973777462, + 991903578, 912367099, -44694137, 1176904444, -421552614, -818371958, 
1747917558, -325927722, + 908452108, 1851023419, -1176751719, -1354528380, -72690498, -314284737, 985022747, 963438279, + -1078959975, 604552167, -1021949428, 608791570, 173440395, -2126092136, -1316619236, -1039370342, + 6087993, -110126092, 565464272, -1758099917, -1600929361, 879867909, -1809756372, 400711272, + 1363007700, 30313375, -326425360, 1683520342, -517299994, 2027935492, -1372618620, 128353682, + -1123881663, 137583815, -635454918, -642772911, 45766801, 671509323, -2070602178, 419615363, + 1216882040, -270590488, -1276805128, 371462360, -1357098057, -384158533, 827959816, -596344473, + 702390549, -279505433, -260424530, -71875110, -1208667171, -1499603926, 2036925262, -540420426, + 746144248, -1420958686, 2032221021, 1904936414, 1257750362, 1926727420, 1931587462, 1258381762, + 885133339, 1629985060, 1967222129, 6363718, -1287922800, 1136965286, 1779436847, 1116720494, + 1042326957, 1405999311, 713994583, 940195359, -1542497137, 2061661095, -883155599, 1726753853, + -1547952704, 394851342, 283780712, 776003547, 1123958025, 201262505, 1934038751, 374860238, + +//#define _ZETAS 328 + -3975713, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 1826347, 1826347, 1826347, 2353451, 2353451, 2353451, 2353451, + -359251, -359251, -359251, -359251, -2091905, -2091905, -2091905, -2091905, + 3119733, 3119733, 3119733, 3119733, -2884855, -2884855, -2884855, -2884855, + 3111497, 3111497, 3111497, 3111497, 2680103, 2680103, 2680103, 2680103, + 2725464, 2725464, 1024112, 1024112, -1079900, -1079900, 3585928, 3585928, + -549488, -549488, -1119584, -1119584, 2619752, 2619752, -2108549, -2108549, + -2118186, -2118186, -3859737, -3859737, -1399561, -1399561, -3277672, -3277672, + 1757237, 1757237, -19422, -19422, 4010497, 4010497, 280005, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, -539299, + -1699267, -1643818, 3505694, -3821735, 3507263, 
-2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -3677745, -1452451, 2176455, -1257611, -4083598, -3190144, -3632928, + 3412210, 2147896, -2967645, -411027, -671102, -22981, -381987, 1852771, + -3343383, 508951, 44288, 904516, -3724342, 1653064, 2389356, 759969, + 189548, 3159746, -2409325, 1315589, 1285669, -812732, -3019102, -3628969, + -1528703, -3041255, 3475950, -1585221, 1939314, -1000202, -3157330, 126922, + -983419, 2715295, -3693493, -2477047, -1228525, -1308169, 1349076, -1430430, + 264944, 3097992, -1100098, 3958618, -8578, -3249728, -210977, -1316856, + -3553272, -1851402, -177440, 1341330, -1584928, -1439742, -3881060, 3839961, + 2091667, -3342478, 266997, -3520352, 900702, 495491, -655327, -3556995, + 342297, 3437287, 2842341, 4055324, -3767016, -2994039, -1333058, -451100, + -1279661, 1500165, -542412, -2584293, -2013608, 1957272, -3183426, 810149, + -3038916, 2213111, -426683, -1667432, -2939036, 183443, -554416, 3937738, + 3407706, 2244091, 2434439, -3759364, 1859098, -1613174, -3122442, -525098, + 286988, -3342277, 2691481, 1247620, 1250494, 1869119, 1237275, 1312455, + 1917081, 777191, -2831860, -3724270, 2432395, 3369112, 162844, 1652634, + 3523897, -975884, 1723600, -1104333, -2235985, -976891, 3919660, 1400424, + 2316500, -2446433, -1235728, -1197226, 909542, -43260, 2031748, -768622, + -2437823, 1735879, -2590150, 2486353, 2635921, 1903435, -3318210, 3306115, + -2546312, 2235880, -1671176, 594136, 2454455, 185531, 1616392, -3694233, + 3866901, 1717735, -1803090, -260646, -420899, 1612842, -48306, -846154, + 3817976, -3562462, 3513181, -3193378, 819034, -522500, 3207046, -3595838, + 4108315, 203044, 1265009, 1595974, -3548272, -1050970, -1430225, -1962642, + -1374803, 3406031, -1846953, -3776993, -164721, -1207385, 3014001, -1799107, + 269760, 472078, 1910376, -3833893, -2286327, -3545687, -1362209, 1976782, + } +}; diff --git a/crypto_sign/dilithium5aes/avx2/consts.h 
b/crypto_sign/dilithium5aes/avx2/consts.h new file mode 100644 index 00000000..3bebc6b0 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/consts.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_CONSTS_H +#define PQCLEAN_DILITHIUM5AES_AVX2_CONSTS_H +#include "align.h" +#include "cdecl.h" + + +typedef ALIGNED_INT32(624) qdata_t; +extern const qdata_t PQCLEAN_DILITHIUM5AES_AVX2_qdata; + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/invntt.S b/crypto_sign/dilithium5aes/avx2/invntt.S new file mode 100644 index 00000000..f30535f1 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/invntt.S @@ -0,0 +1,240 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 +vpsubd %ymm\l,%ymm\h,%ymm12 +vpaddd %ymm\h,%ymm\l,%ymm\l + +vpmuldq %ymm\zl0,%ymm12,%ymm13 +vmovshdup %ymm12,%ymm\h +vpmuldq %ymm\zl1,%ymm\h,%ymm14 + +vpmuldq %ymm\zh0,%ymm12,%ymm12 +vpmuldq %ymm\zh1,%ymm\h,%ymm\h + +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 + +vpsubd %ymm13,%ymm12,%ymm12 +vpsubd %ymm14,%ymm\h,%ymm\h + +vmovshdup %ymm12,%ymm12 +vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h +.endm + +.macro levels0t5 off +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +/* level 0 */ +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,5,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 6,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,9,1,3,2,15 + +vpermq 
$0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 10,11,1,3,2,15 + +/* level 1 */ +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,6,1,3,2,15 +butterfly 5,7,1,3,2,15 + +vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 8,10,1,3,2,15 +butterfly 9,11,1,3,2,15 + +/* level 2 */ +vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 +vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 +vmovshdup %ymm3,%ymm1 +vmovshdup %ymm15,%ymm2 +butterfly 4,8,1,3,2,15 +butterfly 5,9,1,3,2,15 +butterfly 6,10,1,3,2,15 +butterfly 7,11,1,3,2,15 + +/* level 3 */ +shuffle2 4,5,3,5 +shuffle2 6,7,4,7 +shuffle2 8,9,6,9 +shuffle2 10,11,8,11 + +vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 +butterfly 3,5 +butterfly 4,7 +butterfly 6,9 +butterfly 8,11 + +/* level 4 */ +shuffle4 3,4,10,4 +shuffle4 6,8,3,8 +shuffle4 5,7,6,7 +shuffle4 9,11,5,11 + +vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 +vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 +butterfly 10,4 +butterfly 3,8 +butterfly 6,7 +butterfly 5,11 + +/* level 5 */ +shuffle8 10,3,9,3 +shuffle8 6,5,10,5 +shuffle8 4,8,6,8 +shuffle8 7,11,4,11 + +vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 +butterfly 9,3 +butterfly 10,5 +butterfly 6,8 +butterfly 4,11 + +vmovdqa %ymm9,256*\off+ 0(%rdi) +vmovdqa %ymm10,256*\off+ 32(%rdi) +vmovdqa %ymm6,256*\off+ 64(%rdi) +vmovdqa %ymm4,256*\off+ 96(%rdi) +vmovdqa %ymm3,256*\off+128(%rdi) +vmovdqa %ymm5,256*\off+160(%rdi) +vmovdqa %ymm8,256*\off+192(%rdi) +vmovdqa %ymm11,256*\off+224(%rdi) +.endm + +.macro levels6t7 off +vmovdqa 0+32*\off(%rdi),%ymm4 +vmovdqa 
128+32*\off(%rdi),%ymm5 +vmovdqa 256+32*\off(%rdi),%ymm6 +vmovdqa 384+32*\off(%rdi),%ymm7 +vmovdqa 512+32*\off(%rdi),%ymm8 +vmovdqa 640+32*\off(%rdi),%ymm9 +vmovdqa 768+32*\off(%rdi),%ymm10 +vmovdqa 896+32*\off(%rdi),%ymm11 + +/* level 6 */ +vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 +butterfly 4,6 +butterfly 5,7 + +vpbroadcastd (_ZETAS_QINV+2)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 +butterfly 8,10 +butterfly 9,11 + +/* level 7 */ +vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) + +vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 +vmovdqa (_8XDIV)*4(%rsi),%ymm2 +vpmuldq %ymm1,%ymm4,%ymm12 +vpmuldq %ymm1,%ymm5,%ymm13 +vmovshdup %ymm4,%ymm8 +vmovshdup %ymm5,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm4,%ymm4 +vpmuldq %ymm2,%ymm5,%ymm5 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm4,%ymm4 +vpsubd %ymm13,%ymm5,%ymm5 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm4,%ymm4 +vmovshdup %ymm5,%ymm5 +vpblendd $0xAA,%ymm8,%ymm4,%ymm4 +vpblendd $0xAA,%ymm9,%ymm5,%ymm5 + +vpmuldq %ymm1,%ymm6,%ymm12 +vpmuldq %ymm1,%ymm7,%ymm13 +vmovshdup %ymm6,%ymm8 +vmovshdup %ymm7,%ymm9 +vpmuldq %ymm1,%ymm8,%ymm14 +vpmuldq %ymm1,%ymm9,%ymm15 +vpmuldq %ymm2,%ymm6,%ymm6 +vpmuldq %ymm2,%ymm7,%ymm7 +vpmuldq %ymm2,%ymm8,%ymm8 +vpmuldq %ymm2,%ymm9,%ymm9 +vpmuldq %ymm0,%ymm12,%ymm12 +vpmuldq %ymm0,%ymm13,%ymm13 +vpmuldq %ymm0,%ymm14,%ymm14 +vpmuldq %ymm0,%ymm15,%ymm15 +vpsubd %ymm12,%ymm6,%ymm6 +vpsubd %ymm13,%ymm7,%ymm7 +vpsubd %ymm14,%ymm8,%ymm8 +vpsubd %ymm15,%ymm9,%ymm9 +vmovshdup %ymm6,%ymm6 +vmovshdup %ymm7,%ymm7 +vpblendd 
/*
 * butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2   (forward-NTT variant)
 *
 * Operates on eight packed 32-bit lanes held in %ymm\l and %ymm\h.  With
 *   p = high 32 bits of h*zh
 *   m = high 32 bits of ((h*zl) mod 2^32) * %ymm0
 * the macro leaves
 *   l = l + p - m
 *   h = l_old - p + m
 *
 * zl0/zh0 name the registers used for the even dwords, zl1/zh1 those used
 * for the odd dwords.  In this file the zl* registers are loaded from the
 * _ZETAS_QINV table and the zh* registers from the _ZETAS table, and %ymm0
 * is loaded from _8XQ by the routine that expands the macro.
 * NOTE(review): p - m has the shape of a Montgomery product of h with the
 * twiddle factor; confirm against the table generation in consts.c.
 *
 * Clobbers %ymm12, %ymm13 and %ymm14.
 */
.macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2
/* h*zl for the even lanes; odd lanes of h are copied down into ymm12 first
   because vpmuldq only reads the even dwords */
vpmuldq %ymm\zl0,%ymm\h,%ymm13
vmovshdup %ymm\h,%ymm12
vpmuldq %ymm\zl1,%ymm12,%ymm14

/* full 64-bit products h*zh for the even (ymm\h) and odd (ymm12) lanes */
vpmuldq %ymm\zh0,%ymm\h,%ymm\h
vpmuldq %ymm\zh1,%ymm12,%ymm12

/* multiply the low 32 bits of h*zl by the constant in ymm0 */
vpmuldq %ymm0,%ymm13,%ymm13
vpmuldq %ymm0,%ymm14,%ymm14

/* gather the high dwords of h*zh into one vector: p */
vmovshdup %ymm\h,%ymm\h
vpblendd $0xAA,%ymm12,%ymm\h,%ymm\h

/* ymm12 = l - p, l = l + p */
vpsubd %ymm\h,%ymm\l,%ymm12
vpaddd %ymm\h,%ymm\l,%ymm\l

/* gather the high dwords of the ymm0 products into one vector: m */
vmovshdup %ymm13,%ymm13
vpblendd $0xAA,%ymm14,%ymm13,%ymm13

/* apply the correction term with opposite signs to the two outputs */
vpaddd %ymm13,%ymm12,%ymm\h
vpsubd %ymm13,%ymm\l,%ymm\l
.endm
+butterfly 8,10 +butterfly 9,11 + +vmovdqa %ymm4, 0+32*\off(%rdi) +vmovdqa %ymm5,128+32*\off(%rdi) +vmovdqa %ymm6,256+32*\off(%rdi) +vmovdqa %ymm7,384+32*\off(%rdi) +vmovdqa %ymm8,512+32*\off(%rdi) +vmovdqa %ymm9,640+32*\off(%rdi) +vmovdqa %ymm10,768+32*\off(%rdi) +vmovdqa %ymm11,896+32*\off(%rdi) +.endm + +.macro levels2t7 off +/* level 2 */ +vmovdqa 256*\off+ 0(%rdi),%ymm4 +vmovdqa 256*\off+ 32(%rdi),%ymm5 +vmovdqa 256*\off+ 64(%rdi),%ymm6 +vmovdqa 256*\off+ 96(%rdi),%ymm7 +vmovdqa 256*\off+128(%rdi),%ymm8 +vmovdqa 256*\off+160(%rdi),%ymm9 +vmovdqa 256*\off+192(%rdi),%ymm10 +vmovdqa 256*\off+224(%rdi),%ymm11 + +vpbroadcastd (_ZETAS_QINV+4+\off)*4(%rsi),%ymm1 +vpbroadcastd (_ZETAS+4+\off)*4(%rsi),%ymm2 + +butterfly 4,8 +butterfly 5,9 +butterfly 6,10 +butterfly 7,11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +/* level 3 */ +vmovdqa (_ZETAS_QINV+8+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+8+8*\off)*4(%rsi),%ymm2 + +butterfly 3,5 +butterfly 8,10 +butterfly 4,6 +butterfly 9,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +/* level 4 */ +vmovdqa (_ZETAS_QINV+40+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+40+8*\off)*4(%rsi),%ymm2 + +butterfly 7,8 +butterfly 5,6 +butterfly 3,4 +butterfly 10,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +/* level 5 */ +vmovdqa (_ZETAS_QINV+72+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+72+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 + +butterfly 9,5,1,10,2,15 +butterfly 8,4,1,10,2,15 +butterfly 7,3,1,10,2,15 +butterfly 6,11,1,10,2,15 + +/* level 6 */ +vmovdqa (_ZETAS_QINV+104+8*\off)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 9,7,1,10,2,15 +butterfly 8,6,1,10,2,15 + +vmovdqa (_ZETAS_QINV+104+8*\off+32)*4(%rsi),%ymm1 +vmovdqa (_ZETAS+104+8*\off+32)*4(%rsi),%ymm2 +vpsrlq $32,%ymm1,%ymm10 +vmovshdup %ymm2,%ymm15 +butterfly 5,3,1,10,2,15 +butterfly 
#ifndef PQCLEAN_DILITHIUM5AES_AVX2_NTT_H
#define PQCLEAN_DILITHIUM5AES_AVX2_NTT_H

/* __m256i is used in every prototype below, so the AVX2 intrinsics header
 * must be pulled in here; the directive had lost its header name. */
#include <immintrin.h>

/*
 * Prototypes of the routines implemented in the accompanying assembly
 * files.  Each one works in place on (or writes to) an array of 256-bit
 * vectors; the routines that take a second/last `qdata` pointer read their
 * constants (modulus, twiddle factors, ...) relative to that pointer.
 */

/* Forward NTT of a, in place. */
void PQCLEAN_DILITHIUM5AES_AVX2_ntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM5AES_AVX2_qdata);
/* Inverse NTT of a, in place. */
void PQCLEAN_DILITHIUM5AES_AVX2_invntt_avx(__m256i *a, const __m256i *PQCLEAN_DILITHIUM5AES_AVX2_qdata);

/* NOTE(review): implementation not visible here; presumably reorders a
 * polynomial between the vectorised NTT layout and the standard order. */
void PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx(__m256i *a);

/* c = a * b coefficient-wise, followed by a reduction using qdata. */
void PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM5AES_AVX2_qdata);
/* c = sum over the polynomials of two vectors of a[i] * b[i],
 * coefficient-wise, with a single reduction at the end. */
void PQCLEAN_DILITHIUM5AES_AVX2_pointwise_acc_avx(__m256i *c, const __m256i *a, const __m256i *b, const __m256i *PQCLEAN_DILITHIUM5AES_AVX2_qdata);

#endif
+* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const polyveck *t1: pointer to output vector t1 +* - uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_pack_sk +* +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). +* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for 
(i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). +* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 +* - const polyvecl *s1: pointer to output vector s1 +* - const polyveck *s2: pointer to output vector s2 +* - uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM5AES_AVX2_challenge hash length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM5AES_AVX2_challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium5aes/avx2/packing.h b/crypto_sign/dilithium5aes/avx2/packing.h new file mode 100644 index 00000000..6c707af7 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_PACKING_H +#define PQCLEAN_DILITHIUM5AES_AVX2_PACKING_H +#include "params.h" +#include "polyvec.h" +#include + +void PQCLEAN_DILITHIUM5AES_AVX2_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM5AES_AVX2_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM5AES_AVX2_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void 
#ifndef PQCLEAN_DILITHIUM5AES_AVX2_PARAMS_H
#define PQCLEAN_DILITHIUM5AES_AVX2_PARAMS_H

/* Algorithm name string exported for this parameter set. */
#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_ALGNAME "Dilithium5-AES"

/* Seed / hash lengths, in bytes. */
#define SEEDBYTES 32
#define CRHBYTES 48

/* Ring constants. */
#define N 256
#define Q 8380417
#define D 13
#define ROOT_OF_UNITY 1753

/* Constants specific to this parameter set. */
#define K 8
#define L 7
#define ETA 2
#define TAU 60
#define BETA 120
#define GAMMA1 (1 << 19)
#define GAMMA2 ((Q-1)/32)
#define OMEGA 75

/* Sizes of the individual bit-packed objects, in bytes. */
#define POLYT1_PACKEDBYTES  320
#define POLYT0_PACKEDBYTES  416
#define POLYETA_PACKEDBYTES  96
#define POLYZ_PACKEDBYTES   640
#define POLYW1_PACKEDBYTES  128
#define POLYVECH_PACKEDBYTES (OMEGA + K)

/* Public key: rho followed by K packed t1 polynomials. */
#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES)
/* Secret key: two seeds, tr, then the packed vectors s1, s2 and t0. */
#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \
        + L*POLYETA_PACKEDBYTES \
        + K*POLYETA_PACKEDBYTES \
        + K*POLYT0_PACKEDBYTES)
/* Signature: challenge seed, L packed z polynomials, packed hint vector. */
#define PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES)

#endif
/*
 * PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx(c, a, b, qdata)
 *
 * Coefficient-wise product of two arrays of 32-bit values followed by a
 * reduction that keeps the high 32 bits of
 *     a*b - ((a*b*QINV) mod 2^32) * Q
 * where QINV and Q are the vectors read from _8XQINV and _8XQ.
 *
 * Register use (assuming the System V AMD64 calling convention and the
 * prototype in ntt.h): %rdi = c (output), %rsi = a, %rdx = b,
 * %rcx = qdata.  All loads and stores are aligned 32-byte accesses.
 *
 * The main loop handles 96 bytes per iteration and runs 10 times; the
 * unrolled tail handles the remaining 64 bytes, 1024 bytes in total.
 * %rdi, %rsi and %rdx are advanced as the loop proceeds, %eax is used as
 * the loop counter.
 */
.text
.global cdecl(PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx)
.global _cdecl(PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx)
cdecl(PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx):
_cdecl(PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx):
#consts
vmovdqa _8XQINV*4(%rcx),%ymm0
vmovdqa _8XQ*4(%rcx),%ymm1

xor %eax,%eax
_looptop1:
#load
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa 64(%rsi),%ymm6
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vmovdqa 64(%rdx),%ymm14
/* vpmuldq only reads the even dwords: make a second copy of every input
   with the odd dwords moved down (vpsrlq $32 and vmovshdup both do that) */
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm6,%ymm7
vpsrlq $32,%ymm10,%ymm11
vpsrlq $32,%ymm12,%ymm13
vmovshdup %ymm14,%ymm15

#mul
/* 64-bit products; even-numbered registers hold the even lanes,
   odd-numbered registers the odd lanes */
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5
vpmuldq %ymm6,%ymm14,%ymm6
vpmuldq %ymm7,%ymm15,%ymm7

#reduce
/* low 32 bits of product * QINV ... */
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm0,%ymm6,%ymm14
vpmuldq %ymm0,%ymm7,%ymm15
/* ... times Q ... */
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpmuldq %ymm1,%ymm14,%ymm14
vpmuldq %ymm1,%ymm15,%ymm15
/* ... subtracted from the 64-bit product */
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsubq %ymm14,%ymm6,%ymm6
vpsubq %ymm15,%ymm7,%ymm7
/* move the high dwords of the even-lane results down to the even dwords */
vpsrlq $32,%ymm2,%ymm2
vpsrlq $32,%ymm4,%ymm4
vmovshdup %ymm6,%ymm6

#store
/* odd dwords from the odd-lane results, even dwords from the even-lane
   results */
vpblendd $0xAA,%ymm3,%ymm2,%ymm2
vpblendd $0xAA,%ymm5,%ymm4,%ymm4
vpblendd $0xAA,%ymm7,%ymm6,%ymm6
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)
vmovdqa %ymm6,64(%rdi)

add $96,%rdi
add $96,%rsi
add $96,%rdx
add $1,%eax
cmp $10,%eax
jb _looptop1

/* tail: the last two vectors (64 bytes), same steps as the loop body */
vmovdqa (%rsi),%ymm2
vmovdqa 32(%rsi),%ymm4
vmovdqa (%rdx),%ymm10
vmovdqa 32(%rdx),%ymm12
vpsrlq $32,%ymm2,%ymm3
vpsrlq $32,%ymm4,%ymm5
vmovshdup %ymm10,%ymm11
vmovshdup %ymm12,%ymm13

#mul
vpmuldq %ymm2,%ymm10,%ymm2
vpmuldq %ymm3,%ymm11,%ymm3
vpmuldq %ymm4,%ymm12,%ymm4
vpmuldq %ymm5,%ymm13,%ymm5

#reduce
vpmuldq %ymm0,%ymm2,%ymm10
vpmuldq %ymm0,%ymm3,%ymm11
vpmuldq %ymm0,%ymm4,%ymm12
vpmuldq %ymm0,%ymm5,%ymm13
vpmuldq %ymm1,%ymm10,%ymm10
vpmuldq %ymm1,%ymm11,%ymm11
vpmuldq %ymm1,%ymm12,%ymm12
vpmuldq %ymm1,%ymm13,%ymm13
vpsubq %ymm10,%ymm2,%ymm2
vpsubq %ymm11,%ymm3,%ymm3
vpsubq %ymm12,%ymm4,%ymm4
vpsubq %ymm13,%ymm5,%ymm5
vpsrlq $32,%ymm2,%ymm2
vmovshdup %ymm4,%ymm4

#store
/* mask 0x55 with the sources swapped selects the same dwords as the 0xAA
   blends in the loop: even dwords from ymm2/ymm4, odd from ymm3/ymm5 */
vpblendd $0x55,%ymm2,%ymm3,%ymm2
vpblendd $0x55,%ymm4,%ymm5,%ymm4
vmovdqa %ymm2,(%rdi)
vmovdqa %ymm4,32(%rdi)

ret
+vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 + +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +add $64,%rsi +add $64,%rdx +add $64,%rdi +add $1,%eax +cmp $16,%eax +jb _looptop2 + +ret diff --git a/crypto_sign/dilithium5aes/avx2/poly.c b/crypto_sign/dilithium5aes/avx2/poly.c new file mode 100644 index 00000000..34884f7b --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/poly.c @@ -0,0 +1,886 @@ +#include "align.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "rejsample.h" +#include "rounding.h" +#include "symmetric.h" +#include +#include +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. Assumes input +* coefficients to be at most 2^31 - 2^22 - 1 in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i off = _mm256_set1_epi32(1 << 22); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_add_epi32(f, off); + g = _mm256_srai_epi32(g, 23); + g = _mm256_mullo_epi32(g, q); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: poly_addq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(poly *a) { + unsigned int i; + __m256i f, g; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i zero = _mm256_setzero_si256(); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_blendv_epi32(zero, q, f); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* positive standard representatives. Assumes input +* coefficients to be at most 2^31 - 2^22 + 1 in +* absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_freeze(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(a); + PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(a); + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_add +* +* Description: Add polynomials. No modular reduction is performed. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_add_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtraced from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + __m256i f, g; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + g = _mm256_load_si256(&b->vec[i]); + f = _mm256_sub_epi32(f, g); + _mm256_store_si256(&c->vec[i], f); + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_shiftl(poly *a) { + unsigned int i; + __m256i f; + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_slli_epi32(f, D); + _mm256_store_si256(&a->vec[i], f); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by up to +* 8*Q in absolute value. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_ntt_avx(a->vec, PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_invntt_avx(a->vec, PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_nttunpack(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx(a->vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_pointwise_avx(c->vec, a->vec, b->vec, PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod^+ Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* positive standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_power2round_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c0, c1 such c mod^+ Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2 except if c1 = (Q-1)/ALPHA where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be positive standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_decompose_avx(a1->vec, a0->vec, a->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_make_hint +* +* Description: Compute hint array. The coefficients of which are the +* indices of the coefficients of the input polynomial +* whose low bits overflow into the high bits. +* +* Arguments: - uint8_t *h: pointer to output hint array (preallocated of length N) +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of hints, i.e. length of hint array. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1) { + unsigned int r; + DBENCH_START(); + + r = PQCLEAN_DILITHIUM5AES_AVX2_make_hint_avx(hint, a0->vec, a1->vec); + + DBENCH_STOP(*tround); + return r; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. 
+* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_AVX2_use_hint_avx(b->vec, a->vec, h->vec); + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input polynomial to be reduced by PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int r; + __m256i f, t; + const __m256i bound = _mm256_set1_epi32(B - 1); + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + + t = _mm256_setzero_si256(); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a->vec[i]); + f = _mm256_abs_epi32(f); + f = _mm256_cmpgt_epi32(f, bound); + t = _mm256_or_si256(t, f); + } + + r = 1 - _mm256_testz_si256(t, t); + DBENCH_STOP(*tsample); + return r; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { /* stop when the output is full or fewer than 3 input bytes remain */ + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; /* keep the low 23 bits of the 3-byte little-endian value */ + + if (t < Q) { /* accept only candidates below Q; everything else is rejected */ + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform (and its _preinit variant, which takes an already initialised stream state instead of seed and nonce) +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + /* PQCLEAN_DILITHIUM5AES_AVX2_rej_uniform_avx reads up to 8 additional bytes */ + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN + 8) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM5AES_AVX2_rej_uniform_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + /* Original note: length of buf is always divisible by 3; hence, no bytes left. NOTE(review): rej_uniform consumes 3 bytes per candidate, so this only holds if STREAM128_BLOCKBYTES is a multiple of 3; otherwise the trailing bytes of each extra block are silently dropped -- confirm for the stream in use. */ + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA]
by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling using the +* output stream of SHAKE256(seed|nonce) +* or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state) { + unsigned int ctr; + ALIGNED_UINT8(REJ_UNIFORM_BUFLEN * STREAM128_BLOCKBYTES) buf; + + stream128_squeezeblocks(buf.coeffs, REJ_UNIFORM_ETA_NBLOCKS, state); + ctr = PQCLEAN_DILITHIUM5AES_AVX2_rej_eta_avx(a->coeffs, buf.coeffs); + + while (ctr < N) { + stream128_squeezeblocks(buf.coeffs, 1, state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf.coeffs, STREAM128_BLOCKBYTES); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + stream128_state state; + stream128_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta_preinit(a, &state); + stream128_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES+STREAM256_BLOCKBYTES-1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state) { + /* PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack reads past the packed data (up to 14 additional bytes); the 14 bytes of slack keep that over-read inside buf */ + ALIGNED_UINT8(POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES + 14) buf; + stream256_squeezeblocks(buf.coeffs, POLY_UNIFORM_GAMMA1_NBLOCKS, state); + PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack(a, buf.coeffs); +} + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce) { + stream256_state state; + stream256_init(&state, seed, nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1_preinit(a, &state); + stream256_release(&state); +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed).
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_poly_challenge(poly *restrict c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + ALIGNED_UINT8(SHAKE256_RATE) buf; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + + memcpy(&signs, buf.coeffs, 8); /* first 8 squeezed bytes supply the sign bits; one bit is consumed per loop iteration below */ + pos = 8; /* index sampling starts right after the sign bytes */ + + memset(c->vec, 0, sizeof(poly)); + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { /* buffer exhausted: squeeze another block */ + shake256_inc_squeeze(buf.coeffs, SHAKE256_RATE, &state); + pos = 0; + } + + b = buf.coeffs[pos++]; + } while (b > i); /* reject bytes until one falls in [0, i] */ + + c->coeffs[i] = c->coeffs[b]; /* move whatever currently sits at b to position i ... */ + c->coeffs[b] = 1 - 2 * (signs & 1); /* ... and store +1 or -1 at b according to the next sign bit */ + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA].
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = ETA - a->coeffs[8 * i + 0]; + t[1] = ETA - a->coeffs[8 * i + 1]; + t[2] = ETA - a->coeffs[8 * i + 2]; + t[3] = ETA - a->coeffs[8 * i + 3]; + t[4] = ETA - a->coeffs[8 * i + 4]; + t[5] = ETA - a->coeffs[8 * i + 5]; + t[6] = ETA - a->coeffs[8 * i + 6]; + t[7] = ETA - a->coeffs[8 * i + 7]; + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyeta_unpack(poly *restrict r, const uint8_t a[POLYETA_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r[5 * i + 0] = (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2); + r[5 * i + 2] = (a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4); + r[5 * i + 3] = (a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6); + r[5 * i + 4] = (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are positive standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyt1_unpack(poly *restrict r, const uint8_t a[POLYT1_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = t[0]; + r[13 * i + 1] = t[0] >> 8; + r[13 * i + 1] |= t[1] << 5; + r[13 * i + 2] = t[1] >> 3; + r[13 * i + 3] = t[1] >> 11; + r[13 * i + 3] |= t[2] << 2; + r[13 * i + 4] = t[2] >> 6; + r[13 * i + 4] |= t[3] << 7; + r[13 * i + 5] = t[3] >> 1; + r[13 * i + 6] = t[3] >> 9; + r[13 * i + 6] |= t[4] << 4; + r[13 * i + 7] = t[4] >> 4; + r[13 * i + 8] = t[4] >> 12; + r[13 * i + 8] |= t[5] << 1; + r[13 * i + 9] = t[5] >> 7; + r[13 * i + 9] |= t[6] << 6; + r[13 * i + 10] = t[6] >> 2; + r[13 * i + 11] = t[6] >> 10; + r[13 * i + 11] |= t[7] << 3; + r[13 * i + 12] = t[7] >> 5; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyt0_unpack(poly *restrict r, const uint8_t a[POLYT0_PACKEDBYTES]) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + 
r->coeffs[8 * i + 5] = (1 << (D - 1)) - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + uint32_t t[4]; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = t[0]; + r[5 * i + 1] = t[0] >> 8; + r[5 * i + 2] = t[0] >> 16; + r[5 * i + 2] |= t[1] << 4; + r[5 * i + 3] = t[1] >> 4; + r[5 * i + 4] = t[1] >> 12; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack(poly *restrict r, const uint8_t a[POLYZ_PACKEDBYTES + 12]) { + unsigned int i; + __m256i f; + const __m256i shufbidx = _mm256_set_epi8(-1, 11, 10, 9, -1, 9, 8, 7, -1, 6, 5, 4, -1, 4, 3, 2, + -1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); + const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); + const __m256i mask = _mm256_set1_epi32(0xFFFFF); + const __m256i gamma1 = _mm256_set1_epi32(GAMMA1); + DBENCH_START(); + + for (i = 0; i < N / 8; i++) { + f = _mm256_loadu_si256((__m256i *)&a[20 * i]); + f = _mm256_permute4x64_epi64(f, 0x94); + f = _mm256_shuffle_epi8(f, shufbidx); + f = _mm256_srlv_epi32(f, srlvdidx); + f = _mm256_and_si256(f, mask); + f = _mm256_sub_epi32(gamma1, f); + _mm256_store_si256(&r->vec[i], f); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be positive standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *restrict a) { + unsigned int i; + __m256i f0, f1, f2, f3, f4, f5, f6, f7; + const __m256i shift = _mm256_set1_epi16((16 << 8) + 1); /* every 16-bit lane holds the byte pair (1, 16) used as maddubs multipliers */ + const __m256i shufbidx = _mm256_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0, + 15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); + DBENCH_START(); + + for (i = 0; i < N / 64; ++i) { /* 64 coefficients in, 32 bytes out per iteration */ + f0 = _mm256_load_si256(&a->vec[8 * i + 0]); + f1 = _mm256_load_si256(&a->vec[8 * i + 1]); + f2 = _mm256_load_si256(&a->vec[8 * i + 2]); + f3 = _mm256_load_si256(&a->vec[8 * i + 3]); + f4 = _mm256_load_si256(&a->vec[8 * i + 4]); + f5 = _mm256_load_si256(&a->vec[8 * i + 5]); + f6 = _mm256_load_si256(&a->vec[8 * i + 6]); + f7 = _mm256_load_si256(&a->vec[8 * i + 7]); + f0 = _mm256_packus_epi32(f0, f1); /* narrow 32-bit coefficients to 16 bits */ + f1 = _mm256_packus_epi32(f2, f3); + f2 = _mm256_packus_epi32(f4, f5); + f3 = _mm256_packus_epi32(f6, f7); + f0 = _mm256_packus_epi16(f0, f1); /* narrow 16-bit values to bytes */ + f1 = _mm256_packus_epi16(f2, f3); + f0 = _mm256_maddubs_epi16(f0, shift); /* per byte pair: b0 + 16*b1, i.e. two 4-bit values merged (assumes coefficients in [0,15]) */ + f1 = _mm256_maddubs_epi16(f1, shift); + f0 = _mm256_packus_epi16(f0, f1); + f0 = _mm256_permute4x64_epi64(f0, 0xD8); /* NOTE(review): permute + shuffle presumably undo the per-128-bit-lane ordering left by the pack instructions -- confirm */ + f0 = _mm256_shuffle_epi8(f0, shufbidx); + _mm256_storeu_si256((__m256i *)&r[32 * i], f0); /* unaligned store: r carries no alignment guarantee */ + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium5aes/avx2/poly.h b/crypto_sign/dilithium5aes/avx2/poly.h new file mode 100644 index 00000000..ca298325 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/poly.h @@ -0,0 +1,52 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_POLY_H +#define PQCLEAN_DILITHIUM5AES_AVX2_POLY_H +#include "align.h" +#include "params.h" +#include "symmetric.h" +#include <stdint.h> + +typedef ALIGNED_INT32(N) poly; + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(poly *a); +void
PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_nttunpack(poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM5AES_AVX2_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_poly_make_hint(uint8_t hint[N], const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta_preinit(poly *a, stream128_state *state); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta(poly *a, const uint8_t seed[SEEDBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1_preinit(poly *a, stream256_state *state); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1(poly *a, const uint8_t seed[CRHBYTES], uint16_t nonce); +void PQCLEAN_DILITHIUM5AES_AVX2_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(uint8_t r[POLYETA_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_polyeta_unpack(poly *r, const uint8_t a[POLYETA_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyt1_pack(uint8_t r[POLYT1_PACKEDBYTES], const 
poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_polyt1_unpack(poly *r, const uint8_t a[POLYT1_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyt0_pack(uint8_t r[POLYT0_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_polyt0_unpack(poly *r, const uint8_t a[POLYT0_PACKEDBYTES]); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyz_pack(uint8_t r[POLYZ_PACKEDBYTES], const poly *a); +void PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack(poly *r, const uint8_t a[POLYZ_PACKEDBYTES + 12]); /* bound matches the definition in poly.c */ + +void PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack(uint8_t r[POLYW1_PACKEDBYTES], const poly *a); /* bound matches the definition in poly.c */ + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/polyvec.c b/crypto_sign/dilithium5aes/avx2/polyvec.c new file mode 100644 index 00000000..bfae22cb --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/polyvec.c @@ -0,0 +1,449 @@ +#include "aes256ctr.h" +#include "consts.h" +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stdint.h> + +#define UNUSED(x) (void)x + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of AES256CTR(rho, nonce) +* with nonce = (i << 8) + j for entry (i,j).
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + uint64_t nonce; + aes256ctr_ctx state; + + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&state, rho, 0); /* initialise once; only the nonce field changes per entry */ + + for (i = 0; i < K; i++) { + for (j = 0; j < L; j++) { + nonce = (i << 8) + j; /* row index in the high byte, column index in the low byte */ + state.n = _mm_loadl_epi64((__m128i *)&nonce); /* overwrite the context's nonce instead of re-running init */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(&mat[i].vec[j], &state); + PQCLEAN_DILITHIUM5AES_AVX2_poly_nttunpack(&mat[i].vec[j]); + } + } +} + + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { /* t[i] = accumulated pointwise product of row mat[i] with v, for each of the K rows */ + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { /* samples the L polynomials of v with consecutive nonces nonce, nonce+1, ... */ + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { /* NOTE(review): poly_uniform_gamma1 declares seed[CRHBYTES]; confirm which length callers actually pass */ + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1(&v->vec[i], seed, L * nonce + i); /* per-polynomial nonce L*nonce + i */ + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_reduce(polyvecl *v) { /* applies poly_reduce to each of the L polynomials */ + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length
L +* to standard representatives. +* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, const polyvecl *u, const polyvecl *v) { + PQCLEAN_DILITHIUM5AES_AVX2_pointwise_acc_avx(w->vec, u->vec->vec, v->vec->vec, PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_reduce(). +* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. 
+**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta(&v->vec[i], seed, nonce++); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_shiftl +* +* Description: Multiply vector of polynomials of Length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM5AES_AVX2_polyveck_reduce(). 
+* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t B: norm bound +* +* Returns 0 if norm of all polynomials are strictly smaller than B <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(&v->vec[i], bound)) { + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - uint8_t *hint: pointer to output hint array +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1) { + unsigned int i, n = 0; + + for (i = 0; i < K; ++i) { + n += PQCLEAN_DILITHIUM5AES_AVX2_poly_make_hint(&hint[n], &v0->vec[i], &v1->vec[i]); + } + + return n; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. 
+* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium5aes/avx2/polyvec.h b/crypto_sign/dilithium5aes/avx2/polyvec.h new file mode 100644 index 00000000..abd2277e --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/polyvec.h @@ -0,0 +1,64 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_POLYVEC_H +#define PQCLEAN_DILITHIUM5AES_AVX2_POLYVEC_H +#include "params.h" +#include "poly.h" +#include + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery(poly *w, + const 
polyvecl *u, + const polyvecl *v); + +int PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_chknorm(const polyvecl *v, int32_t B); + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM5AES_AVX2_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_polyveck_make_hint(uint8_t *hint, const polyveck *v0, const polyveck *v1); +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + + +void PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/rejsample.c b/crypto_sign/dilithium5aes/avx2/rejsample.c new file mode 
100644 index 00000000..ec73790d --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/rejsample.c @@ -0,0 +1,394 @@ +#include "params.h" +#include "rejsample.h" +#include "symmetric.h" +#include +#include + +const uint8_t PQCLEAN_DILITHIUM5AES_AVX2_idxlut[256][8] = { + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0}, + { 1, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 0, 0, 0, 0, 0, 0}, + { 2, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0}, + { 1, 2, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 0, 0, 0, 0, 0}, + { 3, 0, 0, 0, 0, 0, 0, 0}, + { 0, 3, 0, 0, 0, 0, 0, 0}, + { 1, 3, 0, 0, 0, 0, 0, 0}, + { 0, 1, 3, 0, 0, 0, 0, 0}, + { 2, 3, 0, 0, 0, 0, 0, 0}, + { 0, 2, 3, 0, 0, 0, 0, 0}, + { 1, 2, 3, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 0, 0, 0, 0}, + { 4, 0, 0, 0, 0, 0, 0, 0}, + { 0, 4, 0, 0, 0, 0, 0, 0}, + { 1, 4, 0, 0, 0, 0, 0, 0}, + { 0, 1, 4, 0, 0, 0, 0, 0}, + { 2, 4, 0, 0, 0, 0, 0, 0}, + { 0, 2, 4, 0, 0, 0, 0, 0}, + { 1, 2, 4, 0, 0, 0, 0, 0}, + { 0, 1, 2, 4, 0, 0, 0, 0}, + { 3, 4, 0, 0, 0, 0, 0, 0}, + { 0, 3, 4, 0, 0, 0, 0, 0}, + { 1, 3, 4, 0, 0, 0, 0, 0}, + { 0, 1, 3, 4, 0, 0, 0, 0}, + { 2, 3, 4, 0, 0, 0, 0, 0}, + { 0, 2, 3, 4, 0, 0, 0, 0}, + { 1, 2, 3, 4, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 0, 0, 0}, + { 5, 0, 0, 0, 0, 0, 0, 0}, + { 0, 5, 0, 0, 0, 0, 0, 0}, + { 1, 5, 0, 0, 0, 0, 0, 0}, + { 0, 1, 5, 0, 0, 0, 0, 0}, + { 2, 5, 0, 0, 0, 0, 0, 0}, + { 0, 2, 5, 0, 0, 0, 0, 0}, + { 1, 2, 5, 0, 0, 0, 0, 0}, + { 0, 1, 2, 5, 0, 0, 0, 0}, + { 3, 5, 0, 0, 0, 0, 0, 0}, + { 0, 3, 5, 0, 0, 0, 0, 0}, + { 1, 3, 5, 0, 0, 0, 0, 0}, + { 0, 1, 3, 5, 0, 0, 0, 0}, + { 2, 3, 5, 0, 0, 0, 0, 0}, + { 0, 2, 3, 5, 0, 0, 0, 0}, + { 1, 2, 3, 5, 0, 0, 0, 0}, + { 0, 1, 2, 3, 5, 0, 0, 0}, + { 4, 5, 0, 0, 0, 0, 0, 0}, + { 0, 4, 5, 0, 0, 0, 0, 0}, + { 1, 4, 5, 0, 0, 0, 0, 0}, + { 0, 1, 4, 5, 0, 0, 0, 0}, + { 2, 4, 5, 0, 0, 0, 0, 0}, + { 0, 2, 4, 5, 0, 0, 0, 0}, + { 1, 2, 4, 5, 0, 0, 0, 0}, + { 0, 1, 2, 4, 5, 0, 0, 0}, + { 3, 4, 5, 0, 0, 0, 0, 0}, + { 0, 3, 4, 5, 0, 0, 0, 0}, + { 1, 3, 4, 5, 0, 0, 0, 0}, + { 0, 1, 3, 4, 5, 0, 
0, 0}, + { 2, 3, 4, 5, 0, 0, 0, 0}, + { 0, 2, 3, 4, 5, 0, 0, 0}, + { 1, 2, 3, 4, 5, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 0, 0}, + { 6, 0, 0, 0, 0, 0, 0, 0}, + { 0, 6, 0, 0, 0, 0, 0, 0}, + { 1, 6, 0, 0, 0, 0, 0, 0}, + { 0, 1, 6, 0, 0, 0, 0, 0}, + { 2, 6, 0, 0, 0, 0, 0, 0}, + { 0, 2, 6, 0, 0, 0, 0, 0}, + { 1, 2, 6, 0, 0, 0, 0, 0}, + { 0, 1, 2, 6, 0, 0, 0, 0}, + { 3, 6, 0, 0, 0, 0, 0, 0}, + { 0, 3, 6, 0, 0, 0, 0, 0}, + { 1, 3, 6, 0, 0, 0, 0, 0}, + { 0, 1, 3, 6, 0, 0, 0, 0}, + { 2, 3, 6, 0, 0, 0, 0, 0}, + { 0, 2, 3, 6, 0, 0, 0, 0}, + { 1, 2, 3, 6, 0, 0, 0, 0}, + { 0, 1, 2, 3, 6, 0, 0, 0}, + { 4, 6, 0, 0, 0, 0, 0, 0}, + { 0, 4, 6, 0, 0, 0, 0, 0}, + { 1, 4, 6, 0, 0, 0, 0, 0}, + { 0, 1, 4, 6, 0, 0, 0, 0}, + { 2, 4, 6, 0, 0, 0, 0, 0}, + { 0, 2, 4, 6, 0, 0, 0, 0}, + { 1, 2, 4, 6, 0, 0, 0, 0}, + { 0, 1, 2, 4, 6, 0, 0, 0}, + { 3, 4, 6, 0, 0, 0, 0, 0}, + { 0, 3, 4, 6, 0, 0, 0, 0}, + { 1, 3, 4, 6, 0, 0, 0, 0}, + { 0, 1, 3, 4, 6, 0, 0, 0}, + { 2, 3, 4, 6, 0, 0, 0, 0}, + { 0, 2, 3, 4, 6, 0, 0, 0}, + { 1, 2, 3, 4, 6, 0, 0, 0}, + { 0, 1, 2, 3, 4, 6, 0, 0}, + { 5, 6, 0, 0, 0, 0, 0, 0}, + { 0, 5, 6, 0, 0, 0, 0, 0}, + { 1, 5, 6, 0, 0, 0, 0, 0}, + { 0, 1, 5, 6, 0, 0, 0, 0}, + { 2, 5, 6, 0, 0, 0, 0, 0}, + { 0, 2, 5, 6, 0, 0, 0, 0}, + { 1, 2, 5, 6, 0, 0, 0, 0}, + { 0, 1, 2, 5, 6, 0, 0, 0}, + { 3, 5, 6, 0, 0, 0, 0, 0}, + { 0, 3, 5, 6, 0, 0, 0, 0}, + { 1, 3, 5, 6, 0, 0, 0, 0}, + { 0, 1, 3, 5, 6, 0, 0, 0}, + { 2, 3, 5, 6, 0, 0, 0, 0}, + { 0, 2, 3, 5, 6, 0, 0, 0}, + { 1, 2, 3, 5, 6, 0, 0, 0}, + { 0, 1, 2, 3, 5, 6, 0, 0}, + { 4, 5, 6, 0, 0, 0, 0, 0}, + { 0, 4, 5, 6, 0, 0, 0, 0}, + { 1, 4, 5, 6, 0, 0, 0, 0}, + { 0, 1, 4, 5, 6, 0, 0, 0}, + { 2, 4, 5, 6, 0, 0, 0, 0}, + { 0, 2, 4, 5, 6, 0, 0, 0}, + { 1, 2, 4, 5, 6, 0, 0, 0}, + { 0, 1, 2, 4, 5, 6, 0, 0}, + { 3, 4, 5, 6, 0, 0, 0, 0}, + { 0, 3, 4, 5, 6, 0, 0, 0}, + { 1, 3, 4, 5, 6, 0, 0, 0}, + { 0, 1, 3, 4, 5, 6, 0, 0}, + { 2, 3, 4, 5, 6, 0, 0, 0}, + { 0, 2, 3, 4, 5, 6, 0, 0}, + { 1, 2, 3, 4, 5, 6, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 0}, + { 7, 0, 0, 0, 0, 
0, 0, 0}, + { 0, 7, 0, 0, 0, 0, 0, 0}, + { 1, 7, 0, 0, 0, 0, 0, 0}, + { 0, 1, 7, 0, 0, 0, 0, 0}, + { 2, 7, 0, 0, 0, 0, 0, 0}, + { 0, 2, 7, 0, 0, 0, 0, 0}, + { 1, 2, 7, 0, 0, 0, 0, 0}, + { 0, 1, 2, 7, 0, 0, 0, 0}, + { 3, 7, 0, 0, 0, 0, 0, 0}, + { 0, 3, 7, 0, 0, 0, 0, 0}, + { 1, 3, 7, 0, 0, 0, 0, 0}, + { 0, 1, 3, 7, 0, 0, 0, 0}, + { 2, 3, 7, 0, 0, 0, 0, 0}, + { 0, 2, 3, 7, 0, 0, 0, 0}, + { 1, 2, 3, 7, 0, 0, 0, 0}, + { 0, 1, 2, 3, 7, 0, 0, 0}, + { 4, 7, 0, 0, 0, 0, 0, 0}, + { 0, 4, 7, 0, 0, 0, 0, 0}, + { 1, 4, 7, 0, 0, 0, 0, 0}, + { 0, 1, 4, 7, 0, 0, 0, 0}, + { 2, 4, 7, 0, 0, 0, 0, 0}, + { 0, 2, 4, 7, 0, 0, 0, 0}, + { 1, 2, 4, 7, 0, 0, 0, 0}, + { 0, 1, 2, 4, 7, 0, 0, 0}, + { 3, 4, 7, 0, 0, 0, 0, 0}, + { 0, 3, 4, 7, 0, 0, 0, 0}, + { 1, 3, 4, 7, 0, 0, 0, 0}, + { 0, 1, 3, 4, 7, 0, 0, 0}, + { 2, 3, 4, 7, 0, 0, 0, 0}, + { 0, 2, 3, 4, 7, 0, 0, 0}, + { 1, 2, 3, 4, 7, 0, 0, 0}, + { 0, 1, 2, 3, 4, 7, 0, 0}, + { 5, 7, 0, 0, 0, 0, 0, 0}, + { 0, 5, 7, 0, 0, 0, 0, 0}, + { 1, 5, 7, 0, 0, 0, 0, 0}, + { 0, 1, 5, 7, 0, 0, 0, 0}, + { 2, 5, 7, 0, 0, 0, 0, 0}, + { 0, 2, 5, 7, 0, 0, 0, 0}, + { 1, 2, 5, 7, 0, 0, 0, 0}, + { 0, 1, 2, 5, 7, 0, 0, 0}, + { 3, 5, 7, 0, 0, 0, 0, 0}, + { 0, 3, 5, 7, 0, 0, 0, 0}, + { 1, 3, 5, 7, 0, 0, 0, 0}, + { 0, 1, 3, 5, 7, 0, 0, 0}, + { 2, 3, 5, 7, 0, 0, 0, 0}, + { 0, 2, 3, 5, 7, 0, 0, 0}, + { 1, 2, 3, 5, 7, 0, 0, 0}, + { 0, 1, 2, 3, 5, 7, 0, 0}, + { 4, 5, 7, 0, 0, 0, 0, 0}, + { 0, 4, 5, 7, 0, 0, 0, 0}, + { 1, 4, 5, 7, 0, 0, 0, 0}, + { 0, 1, 4, 5, 7, 0, 0, 0}, + { 2, 4, 5, 7, 0, 0, 0, 0}, + { 0, 2, 4, 5, 7, 0, 0, 0}, + { 1, 2, 4, 5, 7, 0, 0, 0}, + { 0, 1, 2, 4, 5, 7, 0, 0}, + { 3, 4, 5, 7, 0, 0, 0, 0}, + { 0, 3, 4, 5, 7, 0, 0, 0}, + { 1, 3, 4, 5, 7, 0, 0, 0}, + { 0, 1, 3, 4, 5, 7, 0, 0}, + { 2, 3, 4, 5, 7, 0, 0, 0}, + { 0, 2, 3, 4, 5, 7, 0, 0}, + { 1, 2, 3, 4, 5, 7, 0, 0}, + { 0, 1, 2, 3, 4, 5, 7, 0}, + { 6, 7, 0, 0, 0, 0, 0, 0}, + { 0, 6, 7, 0, 0, 0, 0, 0}, + { 1, 6, 7, 0, 0, 0, 0, 0}, + { 0, 1, 6, 7, 0, 0, 0, 0}, + { 2, 6, 7, 0, 0, 0, 0, 0}, + { 0, 2, 6, 7, 
0, 0, 0, 0}, + { 1, 2, 6, 7, 0, 0, 0, 0}, + { 0, 1, 2, 6, 7, 0, 0, 0}, + { 3, 6, 7, 0, 0, 0, 0, 0}, + { 0, 3, 6, 7, 0, 0, 0, 0}, + { 1, 3, 6, 7, 0, 0, 0, 0}, + { 0, 1, 3, 6, 7, 0, 0, 0}, + { 2, 3, 6, 7, 0, 0, 0, 0}, + { 0, 2, 3, 6, 7, 0, 0, 0}, + { 1, 2, 3, 6, 7, 0, 0, 0}, + { 0, 1, 2, 3, 6, 7, 0, 0}, + { 4, 6, 7, 0, 0, 0, 0, 0}, + { 0, 4, 6, 7, 0, 0, 0, 0}, + { 1, 4, 6, 7, 0, 0, 0, 0}, + { 0, 1, 4, 6, 7, 0, 0, 0}, + { 2, 4, 6, 7, 0, 0, 0, 0}, + { 0, 2, 4, 6, 7, 0, 0, 0}, + { 1, 2, 4, 6, 7, 0, 0, 0}, + { 0, 1, 2, 4, 6, 7, 0, 0}, + { 3, 4, 6, 7, 0, 0, 0, 0}, + { 0, 3, 4, 6, 7, 0, 0, 0}, + { 1, 3, 4, 6, 7, 0, 0, 0}, + { 0, 1, 3, 4, 6, 7, 0, 0}, + { 2, 3, 4, 6, 7, 0, 0, 0}, + { 0, 2, 3, 4, 6, 7, 0, 0}, + { 1, 2, 3, 4, 6, 7, 0, 0}, + { 0, 1, 2, 3, 4, 6, 7, 0}, + { 5, 6, 7, 0, 0, 0, 0, 0}, + { 0, 5, 6, 7, 0, 0, 0, 0}, + { 1, 5, 6, 7, 0, 0, 0, 0}, + { 0, 1, 5, 6, 7, 0, 0, 0}, + { 2, 5, 6, 7, 0, 0, 0, 0}, + { 0, 2, 5, 6, 7, 0, 0, 0}, + { 1, 2, 5, 6, 7, 0, 0, 0}, + { 0, 1, 2, 5, 6, 7, 0, 0}, + { 3, 5, 6, 7, 0, 0, 0, 0}, + { 0, 3, 5, 6, 7, 0, 0, 0}, + { 1, 3, 5, 6, 7, 0, 0, 0}, + { 0, 1, 3, 5, 6, 7, 0, 0}, + { 2, 3, 5, 6, 7, 0, 0, 0}, + { 0, 2, 3, 5, 6, 7, 0, 0}, + { 1, 2, 3, 5, 6, 7, 0, 0}, + { 0, 1, 2, 3, 5, 6, 7, 0}, + { 4, 5, 6, 7, 0, 0, 0, 0}, + { 0, 4, 5, 6, 7, 0, 0, 0}, + { 1, 4, 5, 6, 7, 0, 0, 0}, + { 0, 1, 4, 5, 6, 7, 0, 0}, + { 2, 4, 5, 6, 7, 0, 0, 0}, + { 0, 2, 4, 5, 6, 7, 0, 0}, + { 1, 2, 4, 5, 6, 7, 0, 0}, + { 0, 1, 2, 4, 5, 6, 7, 0}, + { 3, 4, 5, 6, 7, 0, 0, 0}, + { 0, 3, 4, 5, 6, 7, 0, 0}, + { 1, 3, 4, 5, 6, 7, 0, 0}, + { 0, 1, 3, 4, 5, 6, 7, 0}, + { 2, 3, 4, 5, 6, 7, 0, 0}, + { 0, 2, 3, 4, 5, 6, 7, 0}, + { 1, 2, 3, 4, 5, 6, 7, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7} +}; + +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_rej_uniform_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]) { + unsigned int ctr, pos; + uint32_t good; + __m256i d, tmp; + const __m256i bound = _mm256_set1_epi32(Q); + const __m256i mask = _mm256_set1_epi32(0x7FFFFF); + const __m256i 
idx8 = _mm256_set_epi8(-1, 15, 14, 13, -1, 12, 11, 10, + -1, 9, 8, 7, -1, 6, 5, 4, + -1, 11, 10, 9, -1, 8, 7, 6, + -1, 5, 4, 3, -1, 2, 1, 0); + + ctr = pos = 0; + while (pos <= REJ_UNIFORM_BUFLEN - 24) { + d = _mm256_loadu_si256((__m256i *)&buf[pos]); + d = _mm256_permute4x64_epi64(d, 0x94); + d = _mm256_shuffle_epi8(d, idx8); + d = _mm256_and_si256(d, mask); + pos += 24; + + tmp = _mm256_sub_epi32(d, bound); + good = _mm256_movemask_ps(_mm256_castsi256_ps(tmp)); + tmp = _mm256_cvtepu8_epi32(_mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5AES_AVX2_idxlut[good])); + d = _mm256_permutevar8x32_epi32(d, tmp); + + _mm256_storeu_si256((__m256i *)&r[ctr], d); + ctr += _mm_popcnt_u32(good); + + } + + + return ctr; +} + +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_rej_eta_avx(int32_t *restrict r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]) { + unsigned int ctr, pos; + uint32_t good; + __m256i f0, f1, f2; + __m128i g0, g1; + const __m256i mask = _mm256_set1_epi8(15); + const __m256i eta = _mm256_set1_epi8(ETA); + const __m256i bound = mask; + const __m256i v = _mm256_set1_epi32(-6560); + const __m256i p = _mm256_set1_epi32(5); + + ctr = pos = 0; + while (ctr <= N - 8 && pos <= REJ_UNIFORM_ETA_BUFLEN - 16) { + f0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)&buf[pos])); + f1 = _mm256_slli_epi16(f0, 4); + f0 = _mm256_or_si256(f0, f1); + f0 = _mm256_and_si256(f0, mask); + + f1 = _mm256_sub_epi8(f0, bound); + f0 = _mm256_sub_epi8(eta, f0); + good = _mm256_movemask_epi8(f1); + + g0 = _mm256_castsi256_si128(f0); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i
*)&PQCLEAN_DILITHIUM5AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; + } + g0 = _mm256_extracti128_si256(f0, 1); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5AES_AVX2_idxlut[good & 0xFF]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good & 0xFF); + good >>= 8; + pos += 4; + + if (ctr > N - 8) { + break; + } + g0 = _mm_bsrli_si128(g0, 8); + g1 = _mm_loadl_epi64((__m128i *)&PQCLEAN_DILITHIUM5AES_AVX2_idxlut[good]); + g1 = _mm_shuffle_epi8(g0, g1); + f1 = _mm256_cvtepi8_epi32(g1); + f2 = _mm256_mulhrs_epi16(f1, v); + f2 = _mm256_mullo_epi16(f2, p); + f1 = _mm256_add_epi32(f1, f2); + _mm256_storeu_si256((__m256i *)&r[ctr], f1); + ctr += _mm_popcnt_u32(good); + pos += 4; + } + + uint32_t t0, t1; + while (ctr < N && pos < REJ_UNIFORM_ETA_BUFLEN) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + r[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < N) { + t1 = t1 - (205 * t1 >> 10) * 5; + r[ctr++] = 2 - t1; + } + } + + return ctr; +} diff --git a/crypto_sign/dilithium5aes/avx2/rejsample.h b/crypto_sign/dilithium5aes/avx2/rejsample.h new file mode 100644 index 00000000..c493eaf1 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/rejsample.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_REJSAMPLE_H +#define PQCLEAN_DILITHIUM5AES_AVX2_REJSAMPLE_H +#include "params.h" +#include "symmetric.h" +#include + +#define REJ_UNIFORM_NBLOCKS ((768+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_BUFLEN 
(REJ_UNIFORM_NBLOCKS*STREAM128_BLOCKBYTES) + +#define REJ_UNIFORM_ETA_NBLOCKS ((137+STREAM128_BLOCKBYTES-1)/STREAM128_BLOCKBYTES) +#define REJ_UNIFORM_ETA_BUFLEN (REJ_UNIFORM_ETA_NBLOCKS*STREAM128_BLOCKBYTES) + +extern const uint8_t PQCLEAN_DILITHIUM5AES_AVX2_idxlut[256][8]; + +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_rej_uniform_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_BUFLEN + 8]); + +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_rej_eta_avx(int32_t *r, const uint8_t buf[REJ_UNIFORM_ETA_BUFLEN]); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/rounding.c b/crypto_sign/dilithium5aes/avx2/rounding.c new file mode 100644 index 00000000..1ce05aa0 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/rounding.c @@ -0,0 +1,154 @@ +#include "consts.h" +#include "params.h" +#include "rejsample.h" +#include "rounding.h" +#include <immintrin.h> +#include <stdint.h> +#include <string.h> + +#define _mm256_blendv_epi32(a,b,mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +/************************************************* +* Name: power2round +* +* Description: For finite field elements a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be positive standard representative.
+* +* Arguments: - __m256i *a1: output array of length N/8 with high bits +* - __m256i *a0: output array of length N/8 with low bits a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i mask = _mm256_set1_epi32(-(1 << D)); + const __m256i half = _mm256_set1_epi32((1 << (D - 1)) - 1); + + for (i = 0; i < N / 8; ++i) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, half); + f0 = _mm256_and_si256(f1, mask); + f1 = _mm256_srli_epi32(f1, D); + f0 = _mm256_sub_epi32(f, f0); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +/************************************************* +* Name: decompose +* +* Description: For finite field element a, compute high and low parts a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod Q - Q < 0. Assumes a to be positive standard +* representative. 
+* +* Arguments: - __m256i *a1: output array of length N/8 with high parts +* - __m256i *a0: output array of length N/8 with low parts a0 +* - const __m256i *a: input array of length N/8 +* +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a) { + unsigned int i; + __m256i f, f0, f1; + const __m256i q = _mm256_load_si256(&PQCLEAN_DILITHIUM5AES_AVX2_qdata.vec[_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); + + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + + +/************************************************* +* Name: make_hint +* +* Description: Compute indices of polynomial coefficients whose low bits +* overflow into the high bits. 
+* +* Arguments: - uint8_t *hint: output array (N bytes) receiving the indices of flagged coefficients +* - const __m256i *a0: low bits of input elements +* - const __m256i *a1: high bits of input elements +* +* Returns number of overflowing low bits +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *restrict a0, const __m256i *restrict a1) { + unsigned int i, n = 0; + __m256i f0, f1, g0, g1; + uint32_t bad; + uint64_t idx; + const __m256i low = _mm256_set1_epi32(-GAMMA2); + const __m256i high = _mm256_set1_epi32(GAMMA2); + + for (i = 0; i < N / 8; ++i) { + f0 = _mm256_load_si256(&a0[i]); + f1 = _mm256_load_si256(&a1[i]); + g0 = _mm256_abs_epi32(f0); + g0 = _mm256_cmpgt_epi32(g0, high); + g1 = _mm256_cmpeq_epi32(f0, low); + g1 = _mm256_sign_epi32(g1, f1); + g0 = _mm256_or_si256(g0, g1); + + bad = _mm256_movemask_ps(_mm256_castsi256_ps(g0)); + memcpy(&idx, PQCLEAN_DILITHIUM5AES_AVX2_idxlut[bad], 8); + idx += (uint64_t)0x0808080808080808 * i; + memcpy(&hint[n], &idx, 8); + n += _mm_popcnt_u32(bad); + } + + return n; +} + +/************************************************* +* Name: use_hint +* +* Description: Correct high parts according to hint.
+* +* Arguments: - __m256i *b: output array of length N/8 with corrected high parts +* - const __m256i *a: input array of length N/8 +* - const __m256i *hint: input array of length N/8 with hint bits +* +**************************************************/ +void PQCLEAN_DILITHIUM5AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *restrict hint) { + unsigned int i; + __m256i a0[N / 8]; + __m256i f, g, h, t; + const __m256i zero = _mm256_setzero_si256(); + const __m256i mask = _mm256_set1_epi32(15); + + PQCLEAN_DILITHIUM5AES_AVX2_decompose_avx(b, a0, a); + for (i = 0; i < N / 8; i++) { + f = _mm256_load_si256(&a0[i]); + g = _mm256_load_si256(&b[i]); + h = _mm256_load_si256(&hint[i]); + t = _mm256_blendv_epi32(zero, h, f); + t = _mm256_slli_epi32(t, 1); + h = _mm256_sub_epi32(h, t); + g = _mm256_add_epi32(g, h); + g = _mm256_and_si256(g, mask); + _mm256_store_si256(&b[i], g); + } +} diff --git a/crypto_sign/dilithium5aes/avx2/rounding.h b/crypto_sign/dilithium5aes/avx2/rounding.h new file mode 100644 index 00000000..c367a125 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/rounding.h @@ -0,0 +1,12 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_ROUNDING_H +#define PQCLEAN_DILITHIUM5AES_AVX2_ROUNDING_H +#include "params.h" +#include <immintrin.h> +#include <stdint.h> + +void PQCLEAN_DILITHIUM5AES_AVX2_power2round_avx(__m256i *a1, __m256i *a0, const __m256i *a); +void PQCLEAN_DILITHIUM5AES_AVX2_decompose_avx(__m256i *a1, __m256i *a0, const __m256i *a); +unsigned int PQCLEAN_DILITHIUM5AES_AVX2_make_hint_avx(uint8_t hint[N], const __m256i *a0, const __m256i *a1); +void PQCLEAN_DILITHIUM5AES_AVX2_use_hint_avx(__m256i *b, const __m256i *a, const __m256i *hint); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/shuffle.S b/crypto_sign/dilithium5aes/avx2/shuffle.S new file mode 100644 index 00000000..48f2891e --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/shuffle.S @@ -0,0 +1,54 @@ +#include "cdecl.h" +.include "shuffle.inc" + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4
+vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +#store +vmovdqa %ymm9,(%rdi) +vmovdqa %ymm8,32(%rdi) +vmovdqa %ymm7,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm3,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx) +.global _cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx) +cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx): +_cdecl(PQCLEAN_DILITHIUM5AES_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret diff --git a/crypto_sign/dilithium5aes/avx2/shuffle.inc b/crypto_sign/dilithium5aes/avx2/shuffle.inc new file mode 100644 index 00000000..73e9ffe0 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/shuffle.inc @@ -0,0 +1,25 @@ +.macro shuffle8 r0,r1,r2,r3 +vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle4 r0,r1,r2,r3 +vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2 +vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle2 r0,r1,r2,r3 +#vpsllq $32,%ymm\r1,%ymm\r2 +vmovsldup %ymm\r1,%ymm\r2 +vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrlq $32,%ymm\r0,%ymm\r0 +#vmovshdup %ymm\r0,%ymm\r0 +vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm + +.macro shuffle1 r0,r1,r2,r3 +vpslld $16,%ymm\r1,%ymm\r2 +vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2 +vpsrld $16,%ymm\r0,%ymm\r0 +vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3 +.endm diff --git a/crypto_sign/dilithium5aes/avx2/sign.c b/crypto_sign/dilithium5aes/avx2/sign.c new file mode 100644 index 00000000..0977b975 --- /dev/null +++ 
b/crypto_sign/dilithium5aes/avx2/sign.c @@ -0,0 +1,425 @@ +#include "aes256ctr.h" +#include "align.h" +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include +#include + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_keypair +* +* Description: Generates public and private key. +* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + unsigned int i; + uint8_t seedbuf[3 * SEEDBYTES]; + const uint8_t *rho, *rhoprime, *key; + uint64_t nonce; + aes256ctr_ctx aesctx; + polyvecl rowbuf[1]; + polyvecl s1, *row = rowbuf; + polyveck s2; + poly t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Store rho, key */ + memcpy(pk, rho, SEEDBYTES); + memcpy(sk, rho, SEEDBYTES); + memcpy(sk + SEEDBYTES, key, SEEDBYTES); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + for (i = 0; i < L; ++i) { + nonce = i; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta_preinit(&s1.vec[i], &aesctx); + } + for (i = 0; i < K; ++i) { + nonce = L + i; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_eta_preinit(&s2.vec[i], &aesctx); + } + + /* Pack secret vectors */ + for (i = 0; i < L; i++) { + 
PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + i * POLYETA_PACKEDBYTES, &s1.vec[i]); + } + for (i = 0; i < K; i++) { + PQCLEAN_DILITHIUM5AES_AVX2_polyeta_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + i)*POLYETA_PACKEDBYTES, &s2.vec[i]); + } + + /* Transform s1 */ + PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt(&s1); + + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&aesctx, rho, 0); + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + for (unsigned int j = 0; j < L; j++) { + nonce = (i << 8) + j; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); + PQCLEAN_DILITHIUM5AES_AVX2_poly_nttunpack(&row->vec[j]); + } + + /* Compute inner-product */ + PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery(&t1, row, &s1); + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&t1); + + /* Add error polynomial */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_add(&t1, &t1, &s2.vec[i]); + + /* Round t and pack t1, t0 */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(&t1); + PQCLEAN_DILITHIUM5AES_AVX2_poly_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM5AES_AVX2_polyt1_pack(pk + SEEDBYTES + i * POLYT1_PACKEDBYTES, &t1); + PQCLEAN_DILITHIUM5AES_AVX2_polyt0_pack(sk + 2 * SEEDBYTES + CRHBYTES + (L + K)*POLYETA_PACKEDBYTES + i * POLYT0_PACKEDBYTES, &t0); + } + + /* Compute CRH(rho, t1) and store in secret key */ + crh(sk + 2 * SEEDBYTES, pk, PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES); + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_signature +* +* Description: Computes signature. 
+* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + unsigned int i, n, pos; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint8_t hintbuf[N]; + uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + uint64_t nonce = 0; + polyvecl mat[K], s1, z; + polyveck t0, s2, w1; + poly c, tmp; + union { + polyvecl y; + polyveck w0; + } tmpv; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM5AES_AVX2_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_ntt(&t0); + + aes256ctr_ctx aesctx; + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&aesctx, rhoprime, 0); + +rej: + /* Sample intermediate vector y */ + for (i = 0; i < L; ++i) { + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + nonce++; + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_gamma1_preinit(&z.vec[i], &aesctx); + } + + /* Matrix-vector product */ + tmpv.y = z; + 
PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_ntt(&tmpv.y); + PQCLEAN_DILITHIUM5AES_AVX2_polyvec_matrix_pointwise_montgomery(&w1, mat, &tmpv.y); + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_decompose(&w1, &tmpv.w0, &w1); + PQCLEAN_DILITHIUM5AES_AVX2_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM5AES_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&c); + + /* Compute z, reject if it reveals secret */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s1.vec[i]); + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5AES_AVX2_poly_add(&z.vec[i], &z.vec[i], &tmp); + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&z.vec[i]); + if (PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(&z.vec[i], GAMMA1 - BETA)) { + goto rej; + } + } + + /* Zero hint vector in signature */ + pos = 0; + memset(hint, 0, OMEGA); + + for (i = 0; i < K; i++) { + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &s2.vec[i]); + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5AES_AVX2_poly_sub(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&tmpv.w0.vec[i]); + if (PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(&tmpv.w0.vec[i], GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&tmp, &c, &t0.vec[i]); + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&tmp); + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&tmp); + if 
(PQCLEAN_DILITHIUM5AES_AVX2_poly_chknorm(&tmp, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM5AES_AVX2_poly_add(&tmpv.w0.vec[i], &tmpv.w0.vec[i], &tmp); + n = PQCLEAN_DILITHIUM5AES_AVX2_poly_make_hint(hintbuf, &tmpv.w0.vec[i], &w1.vec[i]); + if (pos + n > OMEGA) { + goto rej; + } + + /* Store hints in signature */ + memcpy(&hint[pos], hintbuf, n); + hint[OMEGA + i] = pos = pos + n; + } + + /* Pack z into signature */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5AES_AVX2_polyz_pack(sig + SEEDBYTES + i * POLYZ_PACKEDBYTES, &z.vec[i]); + } + + *siglen = PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, const uint8_t *m, size_t mlen, const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_verify +* +* Description: Verifies signature. 
+* +* Arguments: - const uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk) { + unsigned int i, j, pos = 0; + /* PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack writes additional 14 bytes */ + ALIGNED_UINT8(K * POLYW1_PACKEDBYTES + 14) buf; + uint8_t mu[CRHBYTES]; + const uint8_t *hint = sig + SEEDBYTES + L * POLYZ_PACKEDBYTES; + uint64_t nonce; + aes256ctr_ctx aesctx; + polyvecl rowbuf[1]; + polyvecl *row = rowbuf; + polyvecl z; + poly c, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Expand challenge polynomial c from sig */ + PQCLEAN_DILITHIUM5AES_AVX2_poly_challenge(&c, sig); + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&c); + + /* Unpack z; shortness follows from unpacking */ + for (i = 0; i < L; i++) { + PQCLEAN_DILITHIUM5AES_AVX2_polyz_unpack(&z.vec[i], sig + SEEDBYTES + i * POLYZ_PACKEDBYTES); + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&z.vec[i]); + } + + PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(&aesctx, pk, 0); + + for (i = 0; i < K; i++) { + /* Expand matrix row */ + for (j = 0; j < L; j++) { + nonce = (i << 8) + j; + aesctx.n = _mm_loadl_epi64((__m128i *)&nonce); + PQCLEAN_DILITHIUM5AES_AVX2_poly_uniform_preinit(&row->vec[j], &aesctx); + 
PQCLEAN_DILITHIUM5AES_AVX2_poly_nttunpack(&row->vec[j]); + } + + /* Compute i-th row of Az - c2^Dt1 */ + PQCLEAN_DILITHIUM5AES_AVX2_polyvecl_pointwise_acc_montgomery(&w1, row, &z); + + PQCLEAN_DILITHIUM5AES_AVX2_polyt1_unpack(&h, pk + SEEDBYTES + i * POLYT1_PACKEDBYTES); + PQCLEAN_DILITHIUM5AES_AVX2_poly_shiftl(&h); + PQCLEAN_DILITHIUM5AES_AVX2_poly_ntt(&h); + PQCLEAN_DILITHIUM5AES_AVX2_poly_pointwise_montgomery(&h, &c, &h); + + PQCLEAN_DILITHIUM5AES_AVX2_poly_sub(&w1, &w1, &h); + PQCLEAN_DILITHIUM5AES_AVX2_poly_reduce(&w1); + PQCLEAN_DILITHIUM5AES_AVX2_poly_invntt_tomont(&w1); + + /* Get hint polynomial and reconstruct w1 */ + memset(h.vec, 0, sizeof(poly)); + if (hint[OMEGA + i] < pos || hint[OMEGA + i] > OMEGA) { + return -1; + } + + for (j = pos; j < hint[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > pos && hint[j] <= hint[j - 1]) { + return -1; + } + h.coeffs[hint[j]] = 1; + } + pos = hint[OMEGA + i]; + + PQCLEAN_DILITHIUM5AES_AVX2_poly_caddq(&w1); + PQCLEAN_DILITHIUM5AES_AVX2_poly_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM5AES_AVX2_polyw1_pack(buf.coeffs + i * POLYW1_PACKEDBYTES, &w1); + } + + /* Extra indices are zero for strong unforgeability */ + for (j = pos; j < OMEGA; ++j) { + if (hint[j]) { + return -1; + } + } + + /* Call random oracle and verify the recomputed challenge seed against sig */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf.coeffs, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf.coeffs, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (buf.coeffs[i] != sig[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_open +* +* Description: Verify signed message. 
+* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, const uint8_t *sm, size_t smlen, const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_verify(sm, PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM5AES_AVX2_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification failed */ + *mlen = -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium5aes/avx2/sign.h b/crypto_sign/dilithium5aes/avx2/sign.h new file mode 100644 index 00000000..c7d60321 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_SIGN_H +#define PQCLEAN_DILITHIUM5AES_AVX2_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM5AES_AVX2_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const 
uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM5AES_AVX2_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5aes/avx2/symmetric.h b/crypto_sign/dilithium5aes/avx2/symmetric.h new file mode 100644 index 00000000..4de47619 --- /dev/null +++ b/crypto_sign/dilithium5aes/avx2/symmetric.h @@ -0,0 +1,25 @@ +#ifndef PQCLEAN_DILITHIUM5AES_AVX2_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5AES_AVX2_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) +#define stream256_init(STATE, SEED, NONCE) PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_DILITHIUM5AES_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) + + +#endif diff --git a/crypto_sign/dilithium5aes/clean/LICENSE b/crypto_sign/dilithium5aes/clean/LICENSE new file mode 100644 index 00000000..08473af7 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/LICENSE @@ -0,0 +1,5 @@ +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) + +For Keccak and AES we are using public-domain +code from sources and by authors listed in +comments on top of the respective files. 
diff --git a/crypto_sign/dilithium5aes/clean/Makefile b/crypto_sign/dilithium5aes/clean/Makefile new file mode 100644 index 00000000..ddc7e3ab --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/Makefile @@ -0,0 +1,19 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libdilithium5aes_clean.a +HEADERS=aes256ctr.h api.h ntt.h packing.h params.h poly.h polyvec.h reduce.h rounding.h sign.h symmetric.h +OBJECTS=aes256ctr.o ntt.o packing.o poly.o polyvec.o reduce.o rounding.o sign.o symmetric-aes.o + +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_sign/dilithium5aes/clean/Makefile.Microsoft_nmake b/crypto_sign/dilithium5aes/clean/Makefile.Microsoft_nmake new file mode 100644 index 00000000..9fc93357 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/Makefile.Microsoft_nmake @@ -0,0 +1,23 @@ +# This Makefile can be used with Microsoft Visual Studio's nmake using the command: +# nmake /f Makefile.Microsoft_nmake + +LIBRARY=libdilithium5aes_clean.lib +OBJECTS=aes256ctr.obj ntt.obj packing.obj poly.obj polyvec.obj reduce.obj rounding.obj sign.obj symmetric-aes.obj + +# Warning C4146 is raised when a unary minus operator is applied to an +# unsigned type; this has nonetheless been standard and portable for as +# long as there has been a C standard, and we need it for constant-time +# computations. Thus, we disable that spurious warning. +CFLAGS=/nologo /O2 /I ..\..\..\common /W4 /WX /wd4146 + +all: $(LIBRARY) + +# Make sure objects are recompiled if headers change. 
+$(OBJECTS): *.h + +$(LIBRARY): $(OBJECTS) + LIB.EXE /NOLOGO /WX /OUT:$@ $** + +clean: + -DEL $(OBJECTS) + -DEL $(LIBRARY) diff --git a/crypto_sign/dilithium5aes/clean/aes256ctr.c b/crypto_sign/dilithium5aes/clean/aes256ctr.c new file mode 100644 index 00000000..445ef4fc --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/aes256ctr.c @@ -0,0 +1,564 @@ +#include "aes256ctr.h" +#include +#include +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +static inline uint32_t br_dec32le(const uint8_t *src) { + return (uint32_t)src[0] + | ((uint32_t)src[1] << 8) + | ((uint32_t)src[2] << 16) + | ((uint32_t)src[3] << 24); +} + +static void br_range_dec32le(uint32_t *v, size_t num, const uint8_t *src) { + while (num-- > 0) { + *v ++ = br_dec32le(src); + src += 4; + } +} + +static inline uint32_t br_swap32(uint32_t x) { + x = ((x & (uint32_t)0x00FF00FF) << 8) + | ((x >> 8) & (uint32_t)0x00FF00FF); + return (x << 16) | (x >> 16); +} + +static inline void br_enc32le(uint8_t *dst, uint32_t x) { + dst[0] = (uint8_t)x; + dst[1] = (uint8_t)(x >> 8); + dst[2] = (uint8_t)(x >> 16); + dst[3] = (uint8_t)(x >> 24); +} + +static void br_range_enc32le(uint8_t *dst, const uint32_t *v, size_t num) { + while (num-- > 0) { + br_enc32le(dst, *v ++); + dst += 4; + } +} + +static void br_aes_ct64_bitslice_Sbox(uint64_t *q) { + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +static void br_aes_ct64_ortho(uint64_t *q) { +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)(cl)) | ((b & (uint64_t)(cl)) << (s)); \ + (y) = ((a & (uint64_t)(ch)) >> (s)) | (b & (uint64_t)(ch)); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +static void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) { + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= (uint64_t)0x00FF00FF00FF00FF; + x2 &= 
(uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +static void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) { + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const uint8_t Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t sub_word(uint32_t x) { + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +static void br_aes_ct64_keysched(uint64_t *comp_skey, const uint8_t *key) { + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + int key_len = 32; + + nk = (int)(key_len >> 2); + nkf = (int)((14 + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & 
(uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } +} + +static void br_aes_ct64_skey_expand(uint64_t *skey, const uint64_t *comp_skey) { + unsigned u, v, n; + + n = (14 + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} + +static inline void add_round_key(uint64_t *q, const uint64_t *sk) { + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void shift_rows(uint64_t *q) { + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t rotr32(uint64_t x) { + return (x << 32) | (x >> 32); +} + +static inline void mix_columns(uint64_t *q) { + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 
16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +static void inc4_be(uint32_t *x) { + *x = br_swap32(*x) + 4; + *x = br_swap32(*x); +} + +static void aes_ctr4x(uint8_t out[64], uint32_t ivw[16], uint64_t sk_exp[64]) { + uint32_t w[16]; + uint64_t q[8]; + int i; + + memcpy(w, ivw, sizeof(w)); + for (i = 0; i < 4; i++) { + br_aes_ct64_interleave_in(&q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + + add_round_key(q, sk_exp); + for (i = 1; i < 14; i++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, sk_exp + (i << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, sk_exp + 112); + + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out(w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(out, w, 16); + + /* Increase counter for next 4 blocks */ + inc4_be(ivw + 3); + inc4_be(ivw + 7); + inc4_be(ivw + 11); + inc4_be(ivw + 15); +} + +static void br_aes_ct64_ctr_init(uint64_t sk_exp[120], const uint8_t *key) { + uint64_t skey[30]; + + br_aes_ct64_keysched(skey, key); + br_aes_ct64_skey_expand(sk_exp, skey); +} + +static void br_aes_ct64_ctr_run(uint64_t sk_exp[120], const uint8_t *iv, uint32_t cc, uint8_t *data, size_t len) { + uint32_t ivw[16]; + size_t i; + + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + ivw[ 3] = br_swap32(cc); + ivw[ 7] = br_swap32(cc + 1); + ivw[11] = br_swap32(cc + 2); 
+ ivw[15] = br_swap32(cc + 3); + + while (len > 64) { + aes_ctr4x(data, ivw, sk_exp); + data += 64; + len -= 64; + } + if (len > 0) { + uint8_t tmp[64]; + aes_ctr4x(tmp, ivw, sk_exp); + for (i = 0; i < len; i++) { + data[i] = tmp[i]; + } + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t *key, const uint8_t *nonce) { + uint64_t sk_exp[120]; + + br_aes_ct64_ctr_init(sk_exp, key); + br_aes_ct64_ctr_run(sk_exp, nonce, 0, out, outlen); +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_init(aes256ctr_ctx *s, const uint8_t *key, const uint8_t *nonce) { + br_aes_ct64_ctr_init(s->sk_exp, key); + + br_range_dec32le(s->ivw, 3, nonce); + memcpy(s->ivw + 4, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 8, s->ivw, 3 * sizeof(uint32_t)); + memcpy(s->ivw + 12, s->ivw, 3 * sizeof(uint32_t)); + s->ivw[ 3] = br_swap32(0); + s->ivw[ 7] = br_swap32(1); + s->ivw[11] = br_swap32(2); + s->ivw[15] = br_swap32(3); +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *s) { + while (nblocks > 0) { + aes_ctr4x(out, s->ivw, s->sk_exp); + out += 64; + nblocks--; + } +} diff --git a/crypto_sign/dilithium5aes/clean/aes256ctr.h b/crypto_sign/dilithium5aes/clean/aes256ctr.h new file mode 100644 index 00000000..3d4261f0 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/aes256ctr.h @@ -0,0 +1,28 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_AES256CTR_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_AES256CTR_H + +#include +#include + +#define AES256CTR_BLOCKBYTES 64 + + +typedef struct { + uint64_t sk_exp[120]; + uint32_t ivw[16]; +} aes256ctr_ctx; + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_prf(uint8_t *out, + size_t outlen, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + const uint8_t nonce[12]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_squeezeblocks(uint8_t *out, + size_t nblocks, + aes256ctr_ctx 
*state); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/api.h b/crypto_sign/dilithium5aes/clean/api.h new file mode 100644 index 00000000..fe7baff2 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/api.h @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_API_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_API_H + +#include +#include + +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES 2592 +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES 4880 +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES 4595 +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_ALGNAME "Dilithium5-AES" + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_signature( + uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_verify( + const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, const uint8_t *pk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign( + uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_open( + uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/ntt.c b/crypto_sign/dilithium5aes/clean/ntt.c new file mode 100644 index 00000000..a2d46318 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/ntt.c @@ -0,0 +1,98 @@ +#include "ntt.h" +#include "params.h" +#include "reduce.h" +#include + +static const int32_t zetas[N] = { + 0, 25847, -2608894, -518909, 237124, -777960, -876248, 466468, + 1826347, 2353451, -359251, -2091905, 3119733, -2884855, 3111497, 2680103, + 2725464, 1024112, -1079900, 3585928, -549488, -1119584, 2619752, -2108549, + -2118186, -3859737, -1399561, -3277672, 1757237, -19422, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, -1661693, -3592148, -2537516, 3915439, + -3861115, -3043716, 3574422, -2867647, 3539968, -300467, 2348700, 
-539299, + -1699267, -1643818, 3505694, -3821735, 3507263, -2140649, -1600420, 3699596, + 811944, 531354, 954230, 3881043, 3900724, -2556880, 2071892, -2797779, + -3930395, -1528703, -3677745, -3041255, -1452451, 3475950, 2176455, -1585221, + -1257611, 1939314, -4083598, -1000202, -3190144, -3157330, -3632928, 126922, + 3412210, -983419, 2147896, 2715295, -2967645, -3693493, -411027, -2477047, + -671102, -1228525, -22981, -1308169, -381987, 1349076, 1852771, -1430430, + -3343383, 264944, 508951, 3097992, 44288, -1100098, 904516, 3958618, + -3724342, -8578, 1653064, -3249728, 2389356, -210977, 759969, -1316856, + 189548, -3553272, 3159746, -1851402, -2409325, -177440, 1315589, 1341330, + 1285669, -1584928, -812732, -1439742, -3019102, -3881060, -3628969, 3839961, + 2091667, 3407706, 2316500, 3817976, -3342478, 2244091, -2446433, -3562462, + 266997, 2434439, -1235728, 3513181, -3520352, -3759364, -1197226, -3193378, + 900702, 1859098, 909542, 819034, 495491, -1613174, -43260, -522500, + -655327, -3122442, 2031748, 3207046, -3556995, -525098, -768622, -3595838, + 342297, 286988, -2437823, 4108315, 3437287, -3342277, 1735879, 203044, + 2842341, 2691481, -2590150, 1265009, 4055324, 1247620, 2486353, 1595974, + -3767016, 1250494, 2635921, -3548272, -2994039, 1869119, 1903435, -1050970, + -1333058, 1237275, -3318210, -1430225, -451100, 1312455, 3306115, -1962642, + -1279661, 1917081, -2546312, -1374803, 1500165, 777191, 2235880, 3406031, + -542412, -2831860, -1671176, -1846953, -2584293, -3724270, 594136, -3776993, + -2013608, 2432395, 2454455, -164721, 1957272, 3369112, 185531, -1207385, + -3183426, 162844, 1616392, 3014001, 810149, 1652634, -3694233, -1799107, + -3038916, 3523897, 3866901, 269760, 2213111, -975884, 1717735, 472078, + -426683, 1723600, -1803090, 1910376, -1667432, -1104333, -260646, -3833893, + -2939036, -2235985, -420899, -2286327, 183443, -976891, 1612842, -3545687, + -554416, 3919660, -48306, -1362209, 3937738, 1400424, -846154, 1976782 +}; + 
+/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_ntt +* +* Description: Forward NTT, in-place. No modular reduction is performed after +* additions or subtractions. Output vector is in bitreversed order. +* +* Arguments: - int32_t a[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_ntt(int32_t a[N]) { + unsigned int len, start, j, k; + int32_t zeta, t; + + k = 0; + for (len = 128; len > 0; len >>= 1) { + for (start = 0; start < N; start = j + len) { + zeta = zetas[++k]; + for (j = start; j < start + len; ++j) { + t = PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + a[j + len] = a[j] - t; + a[j] = a[j] + t; + } + } + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_invntt_tomont +* +* Description: Inverse NTT and multiplication by Montgomery factor 2^32. +* In-place. No modular reductions after additions or +* subtractions; input coefficients need to be smaller than +* Q in absolute value. Output coefficients are smaller than Q in +* absolute value. 
+* +* Arguments: - int32_t a[N]: input/output coefficient array +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_invntt_tomont(int32_t a[N]) { + unsigned int start, len, j, k; + int32_t t, zeta; + const int32_t f = 41978; // mont^2/256 + + k = 256; + for (len = 1; len < N; len <<= 1) { + for (start = 0; start < N; start = j + len) { + zeta = -zetas[--k]; + for (j = start; j < start + len; ++j) { + t = a[j]; + a[j] = t + a[j + len]; + a[j + len] = t - a[j + len]; + a[j + len] = PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce((int64_t)zeta * a[j + len]); + } + } + } + + for (j = 0; j < N; ++j) { + a[j] = PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce((int64_t)f * a[j]); + } +} diff --git a/crypto_sign/dilithium5aes/clean/ntt.h b/crypto_sign/dilithium5aes/clean/ntt.h new file mode 100644 index 00000000..a5b67e7b --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/ntt.h @@ -0,0 +1,10 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_NTT_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_NTT_H +#include "params.h" +#include + +void PQCLEAN_DILITHIUM5AES_CLEAN_ntt(int32_t a[N]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_invntt_tomont(int32_t a[N]); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/packing.c b/crypto_sign/dilithium5aes/clean/packing.c new file mode 100644 index 00000000..a3b2ffa3 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/packing.c @@ -0,0 +1,261 @@ +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_pack_pk +* +* Description: Bit-pack public key pk = (rho, t1). 
+* +* Arguments: - uint8_t pk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const polyveck *t1: pointer to vector t1 +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES], + const uint8_t rho[SEEDBYTES], + const polyveck *t1) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + pk[i] = rho[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_pack(pk + i * POLYT1_PACKEDBYTES, &t1->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_unpack_pk +* +* Description: Unpack public key pk = (rho, t1). +* +* Arguments: - uint8_t rho[]: output byte array for rho +* - polyveck *t1: pointer to output vector t1 +* - const uint8_t pk[]: byte array containing bit-packed pk +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], + polyveck *t1, + const uint8_t pk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = pk[i]; + } + pk += SEEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_unpack(&t1->vec[i], pk + i * POLYT1_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_pack_sk +* +* Description: Bit-pack secret key sk = (rho, tr, key, t0, s1, s2). 
+* The components are written to sk in the order +* rho, key, tr, s1, s2, t0. +* +* Arguments: - uint8_t sk[]: output byte array +* - const uint8_t rho[]: byte array containing rho +* - const uint8_t tr[]: byte array containing tr +* - const uint8_t key[]: byte array containing key +* - const polyveck *t0: pointer to vector t0 +* - const polyvecl *s1: pointer to vector s1 +* - const polyveck *s2: pointer to vector s2 +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = rho[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + sk[i] = key[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + sk[i] = tr[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s1->vec[i]); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_pack(sk + i * POLYETA_PACKEDBYTES, &s2->vec[i]); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_pack(sk + i * POLYT0_PACKEDBYTES, &t0->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sk +* +* Description: Unpack secret key sk = (rho, tr, key, t0, s1, s2). 
+* +* Arguments: - const uint8_t rho[]: output byte array for rho +* - const uint8_t tr[]: output byte array for tr +* - const uint8_t key[]: output byte array for key +* - const polyveck *t0: pointer to output vector t0 +* - const polyvecl *s1: pointer to output vector s1 +* - const polyveck *s2: pointer to output vector s2 +* - uint8_t sk[]: byte array containing bit-packed sk +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES]) { + unsigned int i; + + for (i = 0; i < SEEDBYTES; ++i) { + rho[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < SEEDBYTES; ++i) { + key[i] = sk[i]; + } + sk += SEEDBYTES; + + for (i = 0; i < CRHBYTES; ++i) { + tr[i] = sk[i]; + } + sk += CRHBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_unpack(&s1->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += L * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_unpack(&s2->vec[i], sk + i * POLYETA_PACKEDBYTES); + } + sk += K * POLYETA_PACKEDBYTES; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_unpack(&t0->vec[i], sk + i * POLYT0_PACKEDBYTES); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_pack_sig +* +* Description: Bit-pack signature sig = (c, z, h). 
+* +* Arguments: - uint8_t sig[]: output byte array +* - const uint8_t *c: pointer to PQCLEAN_DILITHIUM5AES_CLEAN_challenge hash length SEEDBYTES +* - const polyvecl *z: pointer to vector z +* - const polyveck *h: pointer to hint vector h +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES], + const uint8_t c[SEEDBYTES], + const polyvecl *z, + const polyveck *h) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + sig[i] = c[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyz_pack(sig + i * POLYZ_PACKEDBYTES, &z->vec[i]); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Encode h */ + for (i = 0; i < OMEGA + K; ++i) { + sig[i] = 0; + } + + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + if (h->vec[i].coeffs[j] != 0) { + sig[k++] = (uint8_t) j; + } + } + + sig[OMEGA + i] = (uint8_t) k; + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sig +* +* Description: Unpack signature sig = (c, z, h). +* +* Arguments: - uint8_t *c: pointer to output PQCLEAN_DILITHIUM5AES_CLEAN_challenge hash +* - polyvecl *z: pointer to output vector z +* - polyveck *h: pointer to output hint vector h +* - const uint8_t sig[]: byte array containing +* bit-packed signature +* +* Returns 1 in case of malformed signature; otherwise 0. 
+**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], + polyvecl *z, + polyveck *h, + const uint8_t sig[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES]) { + unsigned int i, j, k; + + for (i = 0; i < SEEDBYTES; ++i) { + c[i] = sig[i]; + } + sig += SEEDBYTES; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_polyz_unpack(&z->vec[i], sig + i * POLYZ_PACKEDBYTES); + } + sig += L * POLYZ_PACKEDBYTES; + + /* Decode h */ + k = 0; + for (i = 0; i < K; ++i) { + for (j = 0; j < N; ++j) { + h->vec[i].coeffs[j] = 0; + } + + if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA) { + return 1; + } + + for (j = k; j < sig[OMEGA + i]; ++j) { + /* Coefficients are ordered for strong unforgeability */ + if (j > k && sig[j] <= sig[j - 1]) { + return 1; + } + h->vec[i].coeffs[sig[j]] = 1; + } + + k = sig[OMEGA + i]; + } + + /* Extra indices are zero for strong unforgeability */ + for (j = k; j < OMEGA; ++j) { + if (sig[j]) { + return 1; + } + } + + return 0; +} diff --git a/crypto_sign/dilithium5aes/clean/packing.h b/crypto_sign/dilithium5aes/clean/packing.h new file mode 100644 index 00000000..4bca683f --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/packing.h @@ -0,0 +1,31 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_PACKING_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_PACKING_H +#include "params.h" +#include "polyvec.h" +#include + +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_pk(uint8_t pk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES], const uint8_t rho[SEEDBYTES], const polyveck *t1); + +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_sk(uint8_t sk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES], + const uint8_t rho[SEEDBYTES], + const uint8_t tr[CRHBYTES], + const uint8_t key[SEEDBYTES], + const polyveck *t0, + const polyvecl *s1, + const polyveck *s2); + +void PQCLEAN_DILITHIUM5AES_CLEAN_pack_sig(uint8_t sig[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES], const uint8_t c[SEEDBYTES], const polyvecl *z, const polyveck *h); + +void 
PQCLEAN_DILITHIUM5AES_CLEAN_unpack_pk(uint8_t rho[SEEDBYTES], polyveck *t1, const uint8_t pk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sk(uint8_t rho[SEEDBYTES], + uint8_t tr[CRHBYTES], + uint8_t key[SEEDBYTES], + polyveck *t0, + polyvecl *s1, + polyveck *s2, + const uint8_t sk[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES]); + +int PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sig(uint8_t c[SEEDBYTES], polyvecl *z, polyveck *h, const uint8_t sig[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES]); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/params.h b/crypto_sign/dilithium5aes/clean/params.h new file mode 100644 index 00000000..b8f9630e --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/params.h @@ -0,0 +1,41 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_PARAMS_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_PARAMS_H + + + +#define SEEDBYTES 32 +#define CRHBYTES 48 +#define N 256 +#define Q 8380417 +#define D 13 +#define ROOT_OF_UNITY 1753 + +#define K 8 +#define L 7 +#define ETA 2 +#define TAU 60 +#define BETA 120 +#define GAMMA1 (1 << 19) +#define GAMMA2 ((Q-1)/32) +#define OMEGA 75 +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_ALGNAME "Dilithium5-AES" + + +#define POLYT1_PACKEDBYTES 320 +#define POLYT0_PACKEDBYTES 416 +#define POLYVECH_PACKEDBYTES (OMEGA + K) + +#define POLYZ_PACKEDBYTES 640 + +#define POLYW1_PACKEDBYTES 128 + +#define POLYETA_PACKEDBYTES 96 + +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K*POLYT1_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES (2*SEEDBYTES + CRHBYTES \ + + L*POLYETA_PACKEDBYTES \ + + K*POLYETA_PACKEDBYTES \ + + K*POLYT0_PACKEDBYTES) +#define PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES (SEEDBYTES + L*POLYZ_PACKEDBYTES + POLYVECH_PACKEDBYTES) + +#endif diff --git a/crypto_sign/dilithium5aes/clean/poly.c b/crypto_sign/dilithium5aes/clean/poly.c new file mode 100644 index 00000000..ce01b21c --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/poly.c @@ 
-0,0 +1,842 @@ +#include "ntt.h" +#include "params.h" +#include "poly.h" +#include "reduce.h" +#include "rounding.h" +#include "symmetric.h" +#include + +#define DBENCH_START() +#define DBENCH_STOP(t) + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_reduce +* +* Description: Inplace reduction of all coefficients of polynomial to +* representative in [-6283009,6283007]. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_reduce(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_reduce32(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_caddq +* +* Description: For all coefficients of in/out polynomial add Q if +* coefficient is negative. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_caddq(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_caddq(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_freeze +* +* Description: Inplace reduction of all coefficients of polynomial to +* standard representatives. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_freeze(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_freeze(a->coeffs[i]); + } + + DBENCH_STOP(*tred); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_add +* +* Description: Add polynomials. 
No modular reduction is performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first summand +* - const poly *b: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] + b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_sub +* +* Description: Subtract polynomials. No modular reduction is +* performed. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial to be +* subtracted from first input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = a->coeffs[i] - b->coeffs[i]; + } + + DBENCH_STOP(*tadd); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_shiftl +* +* Description: Multiply polynomial by 2^D without modular reduction. Assumes +* input coefficients to be less than 2^{31-D} in absolute value. +* NOTE(review): the shift is applied directly to the stored +* coefficient; left-shifting a negative signed value is +* undefined in ISO C, so callers presumably pass only +* non-negative coefficients -- confirm against call sites. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_shiftl(poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a->coeffs[i] <<= D; + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt +* +* Description: Inplace forward NTT. Coefficients can grow by +* 8*Q in absolute value. 
+* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_CLEAN_ntt(a->coeffs); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_invntt_tomont +* +* Description: Inplace inverse NTT and multiplication by 2^{32}. +* Input coefficients need to be less than Q in absolute +* value and output coefficients are again bounded by Q. +* +* Arguments: - poly *a: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_invntt_tomont(poly *a) { + DBENCH_START(); + + PQCLEAN_DILITHIUM5AES_CLEAN_invntt_tomont(a->coeffs); + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery +* +* Description: Pointwise multiplication of polynomials in NTT domain +* representation and multiplication of resulting polynomial +* by 2^{-32}. +* +* Arguments: - poly *c: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + c->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]); + } + + DBENCH_STOP(*tmul); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_power2round +* +* Description: For all coefficients c of the input polynomial, +* compute c0, c1 such that c mod Q = c1*2^D + c0 +* with -2^{D-1} < c0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_power2round(&a0->coeffs[i], a->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_decompose +* +* Description: For all coefficients c of the input polynomial, +* compute high and low bits c1, c0 such that c mod Q = c1*ALPHA + c0 +* with -ALPHA/2 < c0 <= ALPHA/2, except when c1 = (Q-1)/ALPHA, where we +* set c1 = 0 and -ALPHA/2 <= c0 = c mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - poly *a1: pointer to output polynomial with coefficients c1 +* - poly *a0: pointer to output polynomial with coefficients c0 +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + a1->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_decompose(&a0->coeffs[i], a->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_make_hint +* +* Description: Compute hint polynomial. The coefficients of which indicate +* whether the low bits of the corresponding coefficient of +* the input polynomial overflow into the high bits. +* +* Arguments: - poly *h: pointer to output hint polynomial +* - const poly *a0: pointer to low part of input polynomial +* - const poly *a1: pointer to high part of input polynomial +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1) { + unsigned int i, s = 0; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + h->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_make_hint(a0->coeffs[i], a1->coeffs[i]); + s += h->coeffs[i]; + } + + DBENCH_STOP(*tround); + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_use_hint +* +* Description: Use hint polynomial to correct the high bits of a polynomial. +* +* Arguments: - poly *b: pointer to output polynomial with corrected high bits +* - const poly *a: pointer to input polynomial +* - const poly *h: pointer to input hint polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N; ++i) { + b->coeffs[i] = PQCLEAN_DILITHIUM5AES_CLEAN_use_hint(a->coeffs[i], h->coeffs[i]); + } + + DBENCH_STOP(*tround); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_chknorm +* +* Description: Check infinity norm of polynomial against given bound. +* Assumes input coefficients were reduced by PQCLEAN_DILITHIUM5AES_CLEAN_reduce32(). +* +* Arguments: - const poly *a: pointer to polynomial +* - int32_t B: norm bound +* +* Returns 0 if norm is strictly smaller than B <= (Q-1)/8 and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_poly_chknorm(const poly *a, int32_t B) { + unsigned int i; + int32_t t; + DBENCH_START(); + + if (B > (Q - 1) / 8) { + return 1; + } + + /* It is ok to leak which coefficient violates the bound since + the probability for each coefficient is independent of secret + data but we must not leak the sign of the centralized representative. 
*/ + for (i = 0; i < N; ++i) { + /* Absolute value */ + t = a->coeffs[i] >> 31; + t = a->coeffs[i] - (t & 2 * a->coeffs[i]); + + if (t >= B) { + DBENCH_STOP(*tsample); + return 1; + } + } + + DBENCH_STOP(*tsample); + return 0; +} + +/************************************************* +* Name: rej_uniform +* +* Description: Sample uniformly random coefficients in [0, Q-1] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. +**************************************************/ +static unsigned int rej_uniform(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos + 3 <= buflen) { + t = buf[pos++]; + t |= (uint32_t)buf[pos++] << 8; + t |= (uint32_t)buf[pos++] << 16; + t &= 0x7FFFFF; + + if (t < Q) { + a[ctr++] = t; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform +* +* Description: Sample polynomial with uniformly random coefficients +* in [0,Q-1] by performing rejection sampling on the +* output stream of SHAKE256(seed|nonce) or AES256CTR(seed,nonce). 
+* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_NBLOCKS ((768 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int i, ctr, off; + unsigned int buflen = POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_NBLOCKS * STREAM128_BLOCKBYTES + 2]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_NBLOCKS, &state); + + ctr = rej_uniform(a->coeffs, N, buf, buflen); + + while (ctr < N) { + off = buflen % 3; + for (i = 0; i < off; ++i) { + buf[i] = buf[buflen - off + i]; + } + + stream128_squeezeblocks(buf + off, 1, &state); + buflen = STREAM128_BLOCKBYTES + off; + ctr += rej_uniform(a->coeffs + ctr, N - ctr, buf, buflen); + } + stream128_release(&state); +} + +/************************************************* +* Name: rej_eta +* +* Description: Sample uniformly random coefficients in [-ETA, ETA] by +* performing rejection sampling on array of random bytes. +* +* Arguments: - int32_t *a: pointer to output array (allocated) +* - unsigned int len: number of coefficients to be sampled +* - const uint8_t *buf: array of random bytes +* - unsigned int buflen: length of array of random bytes +* +* Returns number of sampled coefficients. Can be smaller than len if not enough +* random bytes were given. 
+**************************************************/ +static unsigned int rej_eta(int32_t *a, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr, pos; + uint32_t t0, t1; + DBENCH_START(); + + ctr = pos = 0; + while (ctr < len && pos < buflen) { + t0 = buf[pos] & 0x0F; + t1 = buf[pos++] >> 4; + + if (t0 < 15) { + t0 = t0 - (205 * t0 >> 10) * 5; + a[ctr++] = 2 - t0; + } + if (t1 < 15 && ctr < len) { + t1 = t1 - (205 * t1 >> 10) * 5; + a[ctr++] = 2 - t1; + } + } + + DBENCH_STOP(*tsample); + return ctr; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_eta +* +* Description: Sample polynomial with uniformly random coefficients +* in [-ETA,ETA] by performing rejection sampling on the +* output stream from SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length SEEDBYTES +* - uint16_t nonce: 2-byte nonce +**************************************************/ +#define POLY_UNIFORM_ETA_NBLOCKS ((136 + STREAM128_BLOCKBYTES - 1)/STREAM128_BLOCKBYTES) +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce) { + unsigned int ctr; + unsigned int buflen = POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES; + uint8_t buf[POLY_UNIFORM_ETA_NBLOCKS * STREAM128_BLOCKBYTES]; + stream128_state state; + + stream128_init(&state, seed, nonce); + stream128_squeezeblocks(buf, POLY_UNIFORM_ETA_NBLOCKS, &state); + + ctr = rej_eta(a->coeffs, N, buf, buflen); + + while (ctr < N) { + stream128_squeezeblocks(buf, 1, &state); + ctr += rej_eta(a->coeffs + ctr, N - ctr, buf, STREAM128_BLOCKBYTES); + } + stream128_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_gamma1 +* +* Description: Sample polynomial with uniformly random coefficients +* in [-(GAMMA1 - 1), GAMMA1] by unpacking output stream +* of 
SHAKE256(seed|nonce) or AES256CTR(seed,nonce). +* +* Arguments: - poly *a: pointer to output polynomial +* - const uint8_t seed[]: byte array with seed of length CRHBYTES +* - uint16_t nonce: 16-bit nonce +**************************************************/ +#define POLY_UNIFORM_GAMMA1_NBLOCKS ((POLYZ_PACKEDBYTES + STREAM256_BLOCKBYTES - 1)/STREAM256_BLOCKBYTES) +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce) { + uint8_t buf[POLY_UNIFORM_GAMMA1_NBLOCKS * STREAM256_BLOCKBYTES]; + stream256_state state; + + stream256_init(&state, seed, nonce); + stream256_squeezeblocks(buf, POLY_UNIFORM_GAMMA1_NBLOCKS, &state); + stream256_release(&state); + PQCLEAN_DILITHIUM5AES_CLEAN_polyz_unpack(a, buf); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_poly_challenge +* +* Description: Implementation of H. Samples polynomial with TAU nonzero +* coefficients in {-1,1} using the output stream of +* SHAKE256(seed). 
+* +* Arguments: - poly *c: pointer to output polynomial +* - const uint8_t seed[]: byte array containing seed of length SEEDBYTES +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]) { + unsigned int i, b, pos; + uint64_t signs; + uint8_t buf[SHAKE256_RATE]; + shake256incctx state; + + shake256_inc_init(&state); + shake256_inc_absorb(&state, seed, SEEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(buf, sizeof buf, &state); + + signs = 0; + for (i = 0; i < 8; ++i) { + signs |= (uint64_t)buf[i] << 8 * i; + } + pos = 8; + + for (i = 0; i < N; ++i) { + c->coeffs[i] = 0; + } + for (i = N - TAU; i < N; ++i) { + do { + if (pos >= SHAKE256_RATE) { + shake256_inc_squeeze(buf, sizeof buf, &state); + pos = 0; + } + + b = buf[pos++]; + } while (b > i); + + c->coeffs[i] = c->coeffs[b]; + c->coeffs[b] = 1 - 2 * (signs & 1); + signs >>= 1; + } + shake256_inc_ctx_release(&state); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_pack +* +* Description: Bit-pack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYETA_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint8_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { + t[0] = (uint8_t) (ETA - a->coeffs[8 * i + 0]); + t[1] = (uint8_t) (ETA - a->coeffs[8 * i + 1]); + t[2] = (uint8_t) (ETA - a->coeffs[8 * i + 2]); + t[3] = (uint8_t) (ETA - a->coeffs[8 * i + 3]); + t[4] = (uint8_t) (ETA - a->coeffs[8 * i + 4]); + t[5] = (uint8_t) (ETA - a->coeffs[8 * i + 5]); + t[6] = (uint8_t) (ETA - a->coeffs[8 * i + 6]); + t[7] = (uint8_t) (ETA - a->coeffs[8 * i + 7]); + + r[3 * i + 0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[3 * i + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[3 * i + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_unpack +* +* Description: Unpack polynomial with coefficients in [-ETA,ETA]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 3 input bytes -> 8 three-bit fields per iteration */ + r->coeffs[8 * i + 0] = (a[3 * i + 0] >> 0) & 7; + r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 7; + r->coeffs[8 * i + 2] = ((a[3 * i + 0] >> 6) | (a[3 * i + 1] << 2)) & 7; /* field straddles bytes 0 and 1 */ + r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 7; + r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 7; + r->coeffs[8 * i + 5] = ((a[3 * i + 1] >> 7) | (a[3 * i + 2] << 1)) & 7; /* field straddles bytes 1 and 2 */ + r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 7; + r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5) & 7; + + r->coeffs[8 * i + 0] = ETA - r->coeffs[8 * i + 0]; /* map each field v to ETA - v */ + r->coeffs[8 * i + 1] = ETA - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = ETA - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = ETA - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = ETA - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = ETA - r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = ETA - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = ETA - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_pack +* +* Description: Bit-pack polynomial t1 with coefficients fitting in 10 bits. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 4 coefficients -> 5 output bytes per iteration */ + r[5 * i + 0] = (uint8_t) (a->coeffs[4 * i + 0] >> 0); + r[5 * i + 1] = (uint8_t) ((a->coeffs[4 * i + 0] >> 8) | (a->coeffs[4 * i + 1] << 2)); + r[5 * i + 2] = (uint8_t) ((a->coeffs[4 * i + 1] >> 6) | (a->coeffs[4 * i + 2] << 4)); + r[5 * i + 3] = (uint8_t) ((a->coeffs[4 * i + 2] >> 4) | (a->coeffs[4 * i + 3] << 6)); + r[5 * i + 4] = (uint8_t) (a->coeffs[4 * i + 3] >> 2); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_unpack +* +* Description: Unpack polynomial t1 with 10-bit coefficients. +* Output coefficients are standard representatives. +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 4; ++i) { /* 5 input bytes -> 4 coefficients, each masked to 10 bits */ + r->coeffs[4 * i + 0] = ((a[5 * i + 0] >> 0) | ((uint32_t)a[5 * i + 1] << 8)) & 0x3FF; + r->coeffs[4 * i + 1] = ((a[5 * i + 1] >> 2) | ((uint32_t)a[5 * i + 2] << 6)) & 0x3FF; + r->coeffs[4 * i + 2] = ((a[5 * i + 2] >> 4) | ((uint32_t)a[5 * i + 3] << 4)) & 0x3FF; + r->coeffs[4 * i + 3] = ((a[5 * i + 3] >> 6) | ((uint32_t)a[5 * i + 4] << 2)) & 0x3FF; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_pack +* +* Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYT0_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[8]; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 8 coefficients -> 13 output bytes per iteration */ + t[0] = (1 << (D - 1)) - a->coeffs[8 * i + 0]; /* stored value is 2^(D-1) - coefficient */ + t[1] = (1 << (D - 1)) - a->coeffs[8 * i + 1]; + t[2] = (1 << (D - 1)) - a->coeffs[8 * i + 2]; + t[3] = (1 << (D - 1)) - a->coeffs[8 * i + 3]; + t[4] = (1 << (D - 1)) - a->coeffs[8 * i + 4]; + t[5] = (1 << (D - 1)) - a->coeffs[8 * i + 5]; + t[6] = (1 << (D - 1)) - a->coeffs[8 * i + 6]; + t[7] = (1 << (D - 1)) - a->coeffs[8 * i + 7]; + + r[13 * i + 0] = (uint8_t) t[0]; + r[13 * i + 1] = (uint8_t) (t[0] >> 8); + r[13 * i + 1] |= (uint8_t) (t[1] << 5); /* bytes shared by two values are written with = then |= */ + r[13 * i + 2] = (uint8_t) (t[1] >> 3); + r[13 * i + 3] = (uint8_t) (t[1] >> 11); + r[13 * i + 3] |= (uint8_t) (t[2] << 2); + r[13 * i + 4] = (uint8_t) (t[2] >> 6); + r[13 * i + 4] |= (uint8_t) (t[3] << 7); + r[13 * i + 5] = (uint8_t) (t[3] >> 1); + r[13 * i + 6] = (uint8_t) (t[3] >> 9); + r[13 * i + 6] |= (uint8_t) (t[4] << 4); + r[13 * i + 7] = (uint8_t) (t[4] >> 4); + r[13 * i + 8] = (uint8_t) (t[4] >> 12); + r[13 * i + 8] |= (uint8_t) (t[5] << 1); + r[13 * i + 9] = (uint8_t) (t[5] >> 7); + r[13 * i + 9] |= (uint8_t) (t[6] << 6); + r[13 * i + 10] = (uint8_t) (t[6] >> 2); + r[13 * i + 11] = (uint8_t) (t[6] >> 10); + r[13 * i + 11] |= (uint8_t) (t[7] << 3); + r[13 * i + 12] = (uint8_t) (t[7] >> 5); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_unpack +* +* Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 8; ++i) { /* 13 input bytes -> 8 coefficients per iteration */ + r->coeffs[8 * i + 0] = a[13 * i + 0]; + r->coeffs[8 * i + 0] |= (uint32_t)a[13 * i + 1] << 8; + r->coeffs[8 * i + 0] &= 0x1FFF; /* keep the low 13 bits */ + + r->coeffs[8 * i + 1] = a[13 * i + 1] >> 5; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 2] << 3; + r->coeffs[8 * i + 1] |= (uint32_t)a[13 * i + 3] << 11; + r->coeffs[8 * i + 1] &= 0x1FFF; + + r->coeffs[8 * i + 2] = a[13 * i + 3] >> 2; + r->coeffs[8 * i + 2] |= (uint32_t)a[13 * i + 4] << 6; + r->coeffs[8 * i + 2] &= 0x1FFF; + + r->coeffs[8 * i + 3] = a[13 * i + 4] >> 7; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 5] << 1; + r->coeffs[8 * i + 3] |= (uint32_t)a[13 * i + 6] << 9; + r->coeffs[8 * i + 3] &= 0x1FFF; + + r->coeffs[8 * i + 4] = a[13 * i + 6] >> 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 7] << 4; + r->coeffs[8 * i + 4] |= (uint32_t)a[13 * i + 8] << 12; + r->coeffs[8 * i + 4] &= 0x1FFF; + + r->coeffs[8 * i + 5] = a[13 * i + 8] >> 1; + r->coeffs[8 * i + 5] |= (uint32_t)a[13 * i + 9] << 7; + r->coeffs[8 * i + 5] &= 0x1FFF; + + r->coeffs[8 * i + 6] = a[13 * i + 9] >> 6; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 10] << 2; + r->coeffs[8 * i + 6] |= (uint32_t)a[13 * i + 11] << 10; + r->coeffs[8 * i + 6] &= 0x1FFF; + + r->coeffs[8 * i + 7] = a[13 * i + 11] >> 3; + r->coeffs[8 * i + 7] |= (uint32_t)a[13 * i + 12] << 5; + r->coeffs[8 * i + 7] &= 0x1FFF; + + r->coeffs[8 * i + 0] = (1 << (D - 1)) - r->coeffs[8 * i + 0]; /* map each field v to 2^(D-1) - v */ + r->coeffs[8 * i + 1] = (1 << (D - 1)) - r->coeffs[8 * i + 1]; + r->coeffs[8 * i + 2] = (1 << (D - 1)) - r->coeffs[8 * i + 2]; + r->coeffs[8 * i + 3] = (1 << (D - 1)) - r->coeffs[8 * i + 3]; + r->coeffs[8 * i + 4] = (1 << (D - 1)) - r->coeffs[8 * i + 4]; + r->coeffs[8 * i + 5] = (1 << (D - 1)) 
- r->coeffs[8 * i + 5]; + r->coeffs[8 * i + 6] = (1 << (D - 1)) - r->coeffs[8 * i + 6]; + r->coeffs[8 * i + 7] = (1 << (D - 1)) - r->coeffs[8 * i + 7]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyz_pack +* +* Description: Bit-pack polynomial with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. +* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYZ_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyz_pack(uint8_t *r, const poly *a) { + unsigned int i; + uint32_t t[4]; /* only t[0] and t[1] are used in this function */ + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { /* 2 coefficients -> 5 output bytes per iteration */ + t[0] = GAMMA1 - a->coeffs[2 * i + 0]; /* stored value is GAMMA1 - coefficient */ + t[1] = GAMMA1 - a->coeffs[2 * i + 1]; + + r[5 * i + 0] = (uint8_t) t[0]; + r[5 * i + 1] = (uint8_t) (t[0] >> 8); + r[5 * i + 2] = (uint8_t) (t[0] >> 16); + r[5 * i + 2] |= (uint8_t) (t[1] << 4); /* byte 2 is shared by t[0] and t[1] */ + r[5 * i + 3] = (uint8_t) (t[1] >> 4); + r[5 * i + 4] = (uint8_t) (t[1] >> 12); + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyz_unpack +* +* Description: Unpack polynomial z with coefficients +* in [-(GAMMA1 - 1), GAMMA1]. 
+* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: byte array with bit-packed polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { /* 5 input bytes -> 2 coefficients per iteration */ + r->coeffs[2 * i + 0] = a[5 * i + 0]; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8; + r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 2] << 16; + r->coeffs[2 * i + 0] &= 0xFFFFF; /* keep the low 20 bits */ + + r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4; + r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12; + r->coeffs[2 * i + 1] &= 0xFFFFF; /* mask the coefficient just assembled (4 + 8 + 8 bits, so already <= 20 bits) */ + + r->coeffs[2 * i + 0] = GAMMA1 - r->coeffs[2 * i + 0]; /* map each field v to GAMMA1 - v */ + r->coeffs[2 * i + 1] = GAMMA1 - r->coeffs[2 * i + 1]; + } + + DBENCH_STOP(*tpack); +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyw1_pack +* +* Description: Bit-pack polynomial w1 with coefficients in [0,15] or [0,43]. +* Input coefficients are assumed to be standard representatives. 
+* +* Arguments: - uint8_t *r: pointer to output byte array with at least +* POLYW1_PACKEDBYTES bytes +* - const poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a) { + unsigned int i; + DBENCH_START(); + + for (i = 0; i < N / 2; ++i) { /* two coefficients per byte: even index in the low nibble, odd index shifted left by 4 */ + r[i] = (uint8_t) (a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4)); + } + + DBENCH_STOP(*tpack); +} diff --git a/crypto_sign/dilithium5aes/clean/poly.h b/crypto_sign/dilithium5aes/clean/poly.h new file mode 100644 index 00000000..e3055748 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/poly.h @@ -0,0 +1,53 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_POLY_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_POLY_H +#include "params.h" +#include <stdint.h> + +typedef struct { + int32_t coeffs[N]; /* N signed 32-bit coefficients */ +} poly; + +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_reduce(poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_caddq(poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_freeze(poly *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_add(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_sub(poly *c, const poly *a, const poly *b); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_shiftl(poly *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_invntt_tomont(poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(poly *c, const poly *a, const poly *b); + +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_power2round(poly *a1, poly *a0, const poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_decompose(poly *a1, poly *a0, const poly *a); +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_poly_make_hint(poly *h, const poly *a0, const poly *a1); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_use_hint(poly *b, const poly *a, const poly *h); + +int PQCLEAN_DILITHIUM5AES_CLEAN_poly_chknorm(const poly *a, int32_t B); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void 
PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_eta(poly *a, + const uint8_t seed[SEEDBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_gamma1(poly *a, + const uint8_t seed[CRHBYTES], + uint16_t nonce); +void PQCLEAN_DILITHIUM5AES_CLEAN_poly_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyeta_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt1_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyt0_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyz_pack(uint8_t *r, const poly *a); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyz_unpack(poly *r, const uint8_t *a); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyw1_pack(uint8_t *r, const poly *a); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/polyvec.c b/crypto_sign/dilithium5aes/clean/polyvec.c new file mode 100644 index 00000000..06609473 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/polyvec.c @@ -0,0 +1,448 @@ +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include <stdint.h> + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand +* +* Description: Implementation of ExpandA. Generates matrix A with uniformly +* random coefficients a_{i,j} by performing rejection +* sampling on the output stream of SHAKE128(rho|j|i) +* or AES256CTR(rho,j|i). 
+* +* Arguments: - polyvecl mat[K]: output matrix +* - const uint8_t rho[]: byte array containing seed rho +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]) { + unsigned int i, j; + + for (i = 0; i < K; ++i) { + for (j = 0; j < L; ++j) { + PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform(&mat[i].vec[j], rho, (uint16_t) ((i << 8) + j)); /* nonce: row index i shifted left by 8, plus column index j */ + } + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* t[i] = accumulated pointwise product of row mat[i] with v */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_acc_montgomery(&t->vec[i], &mat[i], v); + } +} + +/**************************************************************/ +/************ Vectors of polynomials of length L **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); /* consecutive nonces, one per polynomial */ + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_gamma1(&v->vec[i], seed, (uint16_t) (L * nonce + i)); /* per-polynomial nonce L * nonce + i, truncated to 16 bits */ + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_reduce(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* apply poly_reduce to each of the L polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_freeze +* +* Description: Reduce coefficients of polynomials in vector of length L +* to standard representatives. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_freeze(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* apply poly_freeze to each of the L polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_add +* +* Description: Add vectors of polynomials of length L. +* No modular reduction is performed. +* +* Arguments: - polyvecl *w: pointer to output vector +* - const polyvecl *u: pointer to first summand +* - const polyvecl *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* component-wise: w[i] = u[i] + v[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt +* +* Description: Forward NTT of all polynomials in vector of length L. Output +* coefficients can be up to 16*Q larger than input coefficients. 
+* +* Arguments: - polyvecl *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* apply poly_ntt to each of the L polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* apply poly_invntt_tomont to each of the L polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v) { + unsigned int i; + + for (i = 0; i < L; ++i) { /* r[i] = pointwise product of the single polynomial a with v[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_acc_montgomery +* +* Description: Pointwise multiply vectors of polynomials of length L, multiply +* resulting vector by 2^{-32} and add (accumulate) polynomials +* in it. Input/output vectors are in NTT domain representation. +* +* Arguments: - poly *w: output polynomial +* - const polyvecl *u: pointer to first input vector +* - const polyvecl *v: pointer to second input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v) { + unsigned int i; + poly t; /* scratch polynomial holding the i-th product */ + + PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(w, &u->vec[0], &v->vec[0]); /* w starts as the product of the first components */ + for (i = 1; i < L; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(&t, &u->vec[i], &v->vec[i]); + PQCLEAN_DILITHIUM5AES_CLEAN_poly_add(w, w, &t); /* add the i-th product into w */ + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_chknorm +* +* Description: Check infinity norm of polynomials in vector of length L. +* Assumes input polyvecl to be reduced by PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_reduce(). 
+* +* Arguments: - const polyvecl *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < L; ++i) { + if (PQCLEAN_DILITHIUM5AES_CLEAN_poly_chknorm(&v->vec[i], bound)) { /* early exit on the first nonzero poly_chknorm result */ + return 1; + } + } + + return 0; +} + +/**************************************************************/ +/************ Vectors of polynomials of length K **************/ +/**************************************************************/ + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce) { + unsigned int i; + + for (i = 0; i < K; ++i) { + PQCLEAN_DILITHIUM5AES_CLEAN_poly_uniform_eta(&v->vec[i], seed, nonce++); /* consecutive nonces, one per polynomial */ + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce +* +* Description: Reduce coefficients of polynomials in vector of length K +* to representatives in [-6283009,6283007]. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_reduce to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_reduce(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq +* +* Description: For all coefficients of polynomials in vector of length K +* add Q if coefficient is negative. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_caddq to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_caddq(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_freeze +* +* Description: Reduce coefficients of polynomials in vector of length K +* to standard representatives. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_freeze(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_freeze to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_freeze(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_add +* +* Description: Add vectors of polynomials of length K. +* No modular reduction is performed. +* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first summand +* - const polyveck *v: pointer to second summand +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* component-wise: w[i] = u[i] + v[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_add(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_sub +* +* Description: Subtract vectors of polynomials of length K. +* No modular reduction is performed. 
+* +* Arguments: - polyveck *w: pointer to output vector +* - const polyveck *u: pointer to first input vector +* - const polyveck *v: pointer to second input vector to be +* subtracted from first input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* component-wise: w[i] = u[i] - v[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_sub(&w->vec[i], &u->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_shiftl +* +* Description: Multiply vector of polynomials of length K by 2^D without modular +* reduction. Assumes input coefficients to be less than 2^{31-D}. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_shiftl(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_shiftl to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_shiftl(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt +* +* Description: Forward NTT of all polynomials in vector of length K. Output +* coefficients can be up to 16*Q larger than input coefficients. +* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_ntt to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(&v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont +* +* Description: Inverse NTT and multiplication by 2^{32} of polynomials +* in vector of length K. Input coefficients need to be less +* than 2*Q. 
+* +* Arguments: - polyveck *v: pointer to input/output vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_invntt_tomont to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_invntt_tomont(&v->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* r[i] = pointwise product of the single polynomial a with v[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_pointwise_montgomery(&r->vec[i], a, &v->vec[i]); + } +} + + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_chknorm +* +* Description: Check infinity norm of polynomials in vector of length K. +* Assumes input polyveck to be reduced by PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(). +* +* Arguments: - const polyveck *v: pointer to vector +* - int32_t bound: norm bound +* +* Returns 0 if norm of all polynomials is strictly smaller than bound <= (Q-1)/8 +* and 1 otherwise. +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t bound) { + unsigned int i; + + for (i = 0; i < K; ++i) { + if (PQCLEAN_DILITHIUM5AES_CLEAN_poly_chknorm(&v->vec[i], bound)) { /* early exit on the first nonzero poly_chknorm result */ + return 1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_power2round +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute a0, a1 such that a mod^+ Q = a1*2^D + a0 +* with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be +* standard representatives. 
+* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* split each polynomial v[i] into v1[i] and v0[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_power2round(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_decompose +* +* Description: For all coefficients a of polynomials in vector of length K, +* compute high and low bits a0, a1 such that a mod^+ Q = a1*ALPHA + a0 +* with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we +* set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0. +* Assumes coefficients to be standard representatives. +* +* Arguments: - polyveck *v1: pointer to output vector of polynomials with +* coefficients a1 +* - polyveck *v0: pointer to output vector of polynomials with +* coefficients a0 +* - const polyveck *v: pointer to input vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* split each polynomial v[i] into v1[i] and v0[i] */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_decompose(&v1->vec[i], &v0->vec[i], &v->vec[i]); + } +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_make_hint +* +* Description: Compute hint vector. +* +* Arguments: - polyveck *h: pointer to output vector +* - const polyveck *v0: pointer to low part of input vector +* - const polyveck *v1: pointer to high part of input vector +* +* Returns number of 1 bits. 
+**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const polyveck *v1) { + unsigned int i, s = 0; + + for (i = 0; i < K; ++i) { + s += PQCLEAN_DILITHIUM5AES_CLEAN_poly_make_hint(&h->vec[i], &v0->vec[i], &v1->vec[i]); /* sum the per-polynomial counts returned by poly_make_hint */ + } + + return s; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_use_hint +* +* Description: Use hint vector to correct the high bits of input vector. +* +* Arguments: - polyveck *w: pointer to output vector of polynomials with +* corrected high bits +* - const polyveck *u: pointer to input vector +* - const polyveck *h: pointer to input hint vector +**************************************************/ +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* apply poly_use_hint to each of the K polynomials */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_use_hint(&w->vec[i], &u->vec[i], &h->vec[i]); + } +} + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1) { + unsigned int i; + + for (i = 0; i < K; ++i) { /* polynomial i is written at byte offset i * POLYW1_PACKEDBYTES */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyw1_pack(&r[i * POLYW1_PACKEDBYTES], &w1->vec[i]); + } +} diff --git a/crypto_sign/dilithium5aes/clean/polyvec.h b/crypto_sign/dilithium5aes/clean/polyvec.h new file mode 100644 index 00000000..139a99ca --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/polyvec.h @@ -0,0 +1,68 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_POLYVEC_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_POLYVEC_H +#include "params.h" +#include "poly.h" +#include <stdint.h> + +/* Vectors of polynomials of length L */ +typedef struct { + poly vec[L]; +} polyvecl; + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_eta(polyvecl *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_gamma1(polyvecl *v, const uint8_t seed[CRHBYTES], uint16_t nonce); + +void 
PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_reduce(polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_freeze(polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(polyvecl *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_invntt_tomont(polyvecl *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_poly_montgomery(polyvecl *r, const poly *a, const polyvecl *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_acc_montgomery(poly *w, + const polyvecl *u, + const polyvecl *v); + + +int PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_chknorm(const polyvecl *v, int32_t B); + + + +/* Vectors of polynomials of length K */ +typedef struct { + poly vec[K]; +} polyveck; + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_uniform_eta(polyveck *v, const uint8_t seed[SEEDBYTES], uint16_t nonce); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq(polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_freeze(polyveck *v); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_add(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_shiftl(polyveck *v); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt(polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pointwise_poly_montgomery(polyveck *r, const poly *a, const polyveck *v); + +int PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_chknorm(const polyveck *v, int32_t B); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v); +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_make_hint(polyveck *h, + const polyveck *v0, + const 
polyveck *v1); +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pack_w1(uint8_t r[K * POLYW1_PACKEDBYTES], const polyveck *w1); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand(polyvecl mat[K], const uint8_t rho[SEEDBYTES]); + +void PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_pointwise_montgomery(polyveck *t, const polyvecl mat[K], const polyvecl *v); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/reduce.c b/crypto_sign/dilithium5aes/clean/reduce.c new file mode 100644 index 00000000..a98bdb6c --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/reduce.c @@ -0,0 +1,69 @@ +#include "params.h" +#include "reduce.h" +#include <stdint.h> + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce +* +* Description: For finite field element a with -2^{31}Q <= a <= Q*2^31, +* compute r \equiv a*2^{-32} (mod Q) such that -Q < r < Q. +* +* Arguments: - int64_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce(int64_t a) { + int32_t t; + + t = (int32_t)((uint64_t)a * (uint64_t)QINV); /* unsigned multiply avoids signed overflow; only the low 32 bits are kept */ + t = (a - (int64_t)t * Q) >> 32; /* high 32 bits of a - t*Q */ + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_reduce32 +* +* Description: For finite field element a with a <= 2^{31} - 2^{22} - 1, +* compute r \equiv a (mod Q) such that -6283009 <= r <= 6283007. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_reduce32(int32_t a) { + int32_t t; + + t = (a + (1 << 22)) >> 23; /* (a + 2^22) shifted right by 23 bits */ + t = a - t * Q; /* subtract that multiple of Q */ + return t; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_caddq +* +* Description: Add Q if input coefficient is negative. 
+* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_caddq(int32_t a) { + a += (a >> 31) & Q; + return a; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_freeze +* +* Description: For finite field element a, compute standard +* representative r = a mod^+ Q. +* +* Arguments: - int32_t: finite field element a +* +* Returns r. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_freeze(int32_t a) { + a = PQCLEAN_DILITHIUM5AES_CLEAN_reduce32(a); + a = PQCLEAN_DILITHIUM5AES_CLEAN_caddq(a); + return a; +} diff --git a/crypto_sign/dilithium5aes/clean/reduce.h b/crypto_sign/dilithium5aes/clean/reduce.h new file mode 100644 index 00000000..b52d4e9b --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/reduce.h @@ -0,0 +1,17 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_REDUCE_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_REDUCE_H +#include "params.h" +#include + +#define MONT (-4186625) // 2^32 % Q +#define QINV 58728449 // q^(-1) mod 2^32 + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_montgomery_reduce(int64_t a); + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_reduce32(int32_t a); + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_caddq(int32_t a); + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_freeze(int32_t a); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/rounding.c b/crypto_sign/dilithium5aes/clean/rounding.c new file mode 100644 index 00000000..ee8a6cb1 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/rounding.c @@ -0,0 +1,92 @@ +#include "params.h" +#include "rounding.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_power2round +* +* Description: For finite field element a, compute a0, a1 such that +* a mod^+ Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}. +* Assumes a to be standard representative. 
+* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_power2round(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + (1 << (D - 1)) - 1) >> D; + *a0 = a - (a1 << D); + return a1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_decompose +* +* Description: For finite field element a, compute high and low bits a0, a1 such +* that a mod^+ Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2 except +* if a1 = (Q-1)/ALPHA where we set a1 = 0 and +* -ALPHA/2 <= a0 = a mod^+ Q - Q < 0. Assumes a to be standard +* representative. +* +* Arguments: - int32_t a: input element +* - int32_t *a0: pointer to output element a0 +* +* Returns a1. +**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_decompose(int32_t *a0, int32_t a) { + int32_t a1; + + a1 = (a + 127) >> 7; + a1 = (a1 * 1025 + (1 << 21)) >> 22; + a1 &= 15; + + *a0 = a - a1 * 2 * GAMMA2; + *a0 -= (((Q - 1) / 2 - *a0) >> 31) & Q; + return a1; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_make_hint +* +* Description: Compute hint bit indicating whether the low bits of the +* input element overflow into the high bits. +* +* Arguments: - int32_t a0: low bits of input element +* - int32_t a1: high bits of input element +* +* Returns 1 if overflow. +**************************************************/ +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_make_hint(int32_t a0, int32_t a1) { + if (a0 > GAMMA2 || a0 < -GAMMA2 || (a0 == -GAMMA2 && a1 != 0)) { + return 1; + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_use_hint +* +* Description: Correct high bits according to hint. +* +* Arguments: - int32_t a: input element +* - unsigned int hint: hint bit +* +* Returns corrected high bits. 
+**************************************************/ +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_use_hint(int32_t a, unsigned int hint) { + int32_t a0, a1; + + a1 = PQCLEAN_DILITHIUM5AES_CLEAN_decompose(&a0, a); + if (hint == 0) { + return a1; + } + + if (a0 > 0) { + return (a1 + 1) & 15; + } + return (a1 - 1) & 15; +} diff --git a/crypto_sign/dilithium5aes/clean/rounding.h b/crypto_sign/dilithium5aes/clean/rounding.h new file mode 100644 index 00000000..5ddebe00 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/rounding.h @@ -0,0 +1,14 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_ROUNDING_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_ROUNDING_H +#include "params.h" +#include + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_power2round(int32_t *a0, int32_t a); + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_decompose(int32_t *a0, int32_t a); + +unsigned int PQCLEAN_DILITHIUM5AES_CLEAN_make_hint(int32_t a0, int32_t a1); + +int32_t PQCLEAN_DILITHIUM5AES_CLEAN_use_hint(int32_t a, unsigned int hint); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/sign.c b/crypto_sign/dilithium5aes/clean/sign.c new file mode 100644 index 00000000..3ce07c18 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/sign.c @@ -0,0 +1,343 @@ +#include "fips202.h" +#include "packing.h" +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include "randombytes.h" +#include "sign.h" +#include "symmetric.h" +#include + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_keypair +* +* Description: Generates public and private key. 
+* +* Arguments: - uint8_t *pk: pointer to output public key (allocated +* array of PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key (allocated +* array of PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_SECRETKEYBYTES bytes) +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk) { + uint8_t seedbuf[3 * SEEDBYTES]; + uint8_t tr[CRHBYTES]; + const uint8_t *rho, *rhoprime, *key; + polyvecl mat[K]; + polyvecl s1, s1hat; + polyveck s2, t1, t0; + + /* Get randomness for rho, rhoprime and key */ + randombytes(seedbuf, SEEDBYTES); + shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES); + rho = seedbuf; + rhoprime = seedbuf + SEEDBYTES; + key = seedbuf + 2 * SEEDBYTES; + + /* Expand matrix */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand(mat, rho); + + /* Sample short vectors s1 and s2 */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_eta(&s1, rhoprime, 0); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_uniform_eta(&s2, rhoprime, L); + + /* Matrix-vector multiplication */ + s1hat = s1; + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(&s1hat); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_pointwise_montgomery(&t1, mat, &s1hat); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(&t1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(&t1); + + /* Add error vector s2 */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_add(&t1, &t1, &s2); + + /* Extract t1 and write public key */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq(&t1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_power2round(&t1, &t0, &t1); + PQCLEAN_DILITHIUM5AES_CLEAN_pack_pk(pk, rho, &t1); + + /* Compute CRH(rho, t1) and write secret key */ + crh(tr, pk, PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES); + PQCLEAN_DILITHIUM5AES_CLEAN_pack_sk(sk, rho, tr, key, &t0, &s1, &s2); + + return 0; +} + +/************************************************* +* Name: 
PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_signature +* +* Description: Computes signature. +* +* Arguments: - uint8_t *sig: pointer to output signature (of length PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES) +* - size_t *siglen: pointer to output length of signature +* - uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_signature(uint8_t *sig, + size_t *siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + unsigned int n; + uint8_t seedbuf[2 * SEEDBYTES + 3 * CRHBYTES]; + uint8_t *rho, *tr, *key, *mu, *rhoprime; + uint16_t nonce = 0; + polyvecl mat[K], s1, y, z; + polyveck t0, s2, w1, w0, h; + poly cp; + shake256incctx state; + + rho = seedbuf; + tr = rho + SEEDBYTES; + key = tr + CRHBYTES; + mu = key + SEEDBYTES; + rhoprime = mu + CRHBYTES; + PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sk(rho, tr, key, &t0, &s1, &s2, sk); + + /* Compute CRH(tr, msg) */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, tr, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + crh(rhoprime, key, SEEDBYTES + CRHBYTES); + + /* Expand matrix and transform vectors */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand(mat, rho); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(&s1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt(&s2); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt(&t0); + +rej: + /* Sample intermediate vector y */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_uniform_gamma1(&y, rhoprime, nonce++); + + /* Matrix-vector multiplication */ + z = y; + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(&w1); + 
PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(&w1); + + /* Decompose w and call the random oracle */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_decompose(&w1, &w0, &w1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pack_w1(sig, &w1); + + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, sig, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(sig, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + PQCLEAN_DILITHIUM5AES_CLEAN_poly_challenge(&cp, sig); + PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(&cp); + + /* Compute z, reject if it reveals secret */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_pointwise_poly_montgomery(&z, &cp, &s1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_invntt_tomont(&z); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_add(&z, &z, &y); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_reduce(&z); + if (PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + goto rej; + } + + /* Check that subtracting cs2 does not change high bits of w and low bits + * do not reveal secret information */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &s2); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_sub(&w0, &w0, &h); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(&w0); + if (PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_chknorm(&w0, GAMMA2 - BETA)) { + goto rej; + } + + /* Compute hints for w1 */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pointwise_poly_montgomery(&h, &cp, &t0); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(&h); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(&h); + if (PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_chknorm(&h, GAMMA2)) { + goto rej; + } + + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_add(&w0, &w0, &h); + n = PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_make_hint(&h, &w0, &w1); + if (n > OMEGA) { + goto rej; + } + + /* Write signature */ + 
PQCLEAN_DILITHIUM5AES_CLEAN_pack_sig(sig, sig, &z, &h); + *siglen = PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign +* +* Description: Compute signed message. +* +* Arguments: - uint8_t *sm: pointer to output signed message (allocated +* array with PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES + mlen bytes), +* can be equal to m +* - size_t *smlen: pointer to output length of signed +* message +* - const uint8_t *m: pointer to message to be signed +* - size_t mlen: length of message +* - const uint8_t *sk: pointer to bit-packed secret key +* +* Returns 0 (success) +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign(uint8_t *sm, + size_t *smlen, + const uint8_t *m, + size_t mlen, + const uint8_t *sk) { + size_t i; + + for (i = 0; i < mlen; ++i) { + sm[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES + mlen - 1 - i] = m[mlen - 1 - i]; + } + PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_signature(sm, smlen, sm + PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES, mlen, sk); + *smlen += mlen; + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_verify +* +* Description: Verifies signature. 
+* +* Arguments: - uint8_t *sig: pointer to input signature +* - size_t siglen: length of signature +* - const uint8_t *m: pointer to message +* - size_t mlen: length of message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signature could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_verify(const uint8_t *sig, + size_t siglen, + const uint8_t *m, + size_t mlen, + const uint8_t *pk) { + unsigned int i; + uint8_t buf[K * POLYW1_PACKEDBYTES]; + uint8_t rho[SEEDBYTES]; + uint8_t mu[CRHBYTES]; + uint8_t c[SEEDBYTES]; + uint8_t c2[SEEDBYTES]; + poly cp; + polyvecl mat[K], z; + polyveck t1, w1, h; + shake256incctx state; + + if (siglen != PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES) { + return -1; + } + + PQCLEAN_DILITHIUM5AES_CLEAN_unpack_pk(rho, &t1, pk); + if (PQCLEAN_DILITHIUM5AES_CLEAN_unpack_sig(c, &z, &h, sig)) { + return -1; + } + if (PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_chknorm(&z, GAMMA1 - BETA)) { + return -1; + } + + /* Compute CRH(CRH(rho, t1), msg) */ + crh(mu, pk, PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_PUBLICKEYBYTES); + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, m, mlen); + shake256_inc_finalize(&state); + shake256_inc_squeeze(mu, CRHBYTES, &state); + shake256_inc_ctx_release(&state); + + /* Matrix-vector multiplication; compute Az - c2^dt1 */ + PQCLEAN_DILITHIUM5AES_CLEAN_poly_challenge(&cp, c); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_expand(mat, rho); + + PQCLEAN_DILITHIUM5AES_CLEAN_polyvecl_ntt(&z); + PQCLEAN_DILITHIUM5AES_CLEAN_polyvec_matrix_pointwise_montgomery(&w1, mat, &z); + + PQCLEAN_DILITHIUM5AES_CLEAN_poly_ntt(&cp); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_shiftl(&t1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_ntt(&t1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pointwise_poly_montgomery(&t1, &cp, &t1); + + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_sub(&w1, &w1, &t1); + 
PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_reduce(&w1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_invntt_tomont(&w1); + + /* Reconstruct w1 */ + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_caddq(&w1); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_use_hint(&w1, &w1, &h); + PQCLEAN_DILITHIUM5AES_CLEAN_polyveck_pack_w1(buf, &w1); + + /* Call random oracle and verify challenge */ + shake256_inc_init(&state); + shake256_inc_absorb(&state, mu, CRHBYTES); + shake256_inc_absorb(&state, buf, K * POLYW1_PACKEDBYTES); + shake256_inc_finalize(&state); + shake256_inc_squeeze(c2, SEEDBYTES, &state); + shake256_inc_ctx_release(&state); + for (i = 0; i < SEEDBYTES; ++i) { + if (c[i] != c2[i]) { + return -1; + } + } + + return 0; +} + +/************************************************* +* Name: PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_open +* +* Description: Verify signed message. +* +* Arguments: - uint8_t *m: pointer to output message (allocated +* array with smlen bytes), can be equal to sm +* - size_t *mlen: pointer to output length of message +* - const uint8_t *sm: pointer to signed message +* - size_t smlen: length of signed message +* - const uint8_t *pk: pointer to bit-packed public key +* +* Returns 0 if signed message could be verified correctly and -1 otherwise +**************************************************/ +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_open(uint8_t *m, + size_t *mlen, + const uint8_t *sm, + size_t smlen, + const uint8_t *pk) { + size_t i; + + if (smlen < PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES) { + goto badsig; + } + + *mlen = smlen - PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES; + if (PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_verify(sm, PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES, sm + PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES, *mlen, pk)) { + goto badsig; + } else { + /* All good, copy msg, return 0 */ + for (i = 0; i < *mlen; ++i) { + m[i] = sm[PQCLEAN_DILITHIUM5AES_CLEAN_CRYPTO_BYTES + i]; + } + return 0; + } + +badsig: + /* Signature verification
failed */ + *mlen = (size_t) -1; + for (i = 0; i < smlen; ++i) { + m[i] = 0; + } + + return -1; +} diff --git a/crypto_sign/dilithium5aes/clean/sign.h b/crypto_sign/dilithium5aes/clean/sign.h new file mode 100644 index 00000000..5f79f48a --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/sign.h @@ -0,0 +1,29 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_SIGN_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_SIGN_H +#include "params.h" +#include "poly.h" +#include "polyvec.h" +#include +#include + +void PQCLEAN_DILITHIUM5AES_CLEAN_challenge(poly *c, const uint8_t seed[SEEDBYTES]); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_signature(uint8_t *sig, size_t *siglen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign(uint8_t *sm, size_t *smlen, + const uint8_t *m, size_t mlen, + const uint8_t *sk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_verify(const uint8_t *sig, size_t siglen, + const uint8_t *m, size_t mlen, + const uint8_t *pk); + +int PQCLEAN_DILITHIUM5AES_CLEAN_crypto_sign_open(uint8_t *m, size_t *mlen, + const uint8_t *sm, size_t smlen, + const uint8_t *pk); + +#endif diff --git a/crypto_sign/dilithium5aes/clean/symmetric-aes.c b/crypto_sign/dilithium5aes/clean/symmetric-aes.c new file mode 100644 index 00000000..d69ce894 --- /dev/null +++ b/crypto_sign/dilithium5aes/clean/symmetric-aes.c @@ -0,0 +1,12 @@ +#include "aes256ctr.h" +#include "symmetric.h" +#include + +void PQCLEAN_DILITHIUM5AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce) { + uint8_t expnonce[12] = {0}; + expnonce[0] = (uint8_t) nonce; + expnonce[1] = (uint8_t) (nonce >> 8); + PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_init(state, key, expnonce); +} diff --git a/crypto_sign/dilithium5aes/clean/symmetric.h b/crypto_sign/dilithium5aes/clean/symmetric.h new file mode 100644 index 00000000..8799e987 --- /dev/null +++ 
b/crypto_sign/dilithium5aes/clean/symmetric.h @@ -0,0 +1,33 @@ +#ifndef PQCLEAN_DILITHIUM5AES_CLEAN_SYMMETRIC_H +#define PQCLEAN_DILITHIUM5AES_CLEAN_SYMMETRIC_H +#include "aes256ctr.h" +#include "fips202.h" +#include "params.h" +#include + + + +typedef aes256ctr_ctx stream128_state; +typedef aes256ctr_ctx stream256_state; + +void PQCLEAN_DILITHIUM5AES_CLEAN_dilithium_aes256ctr_init(aes256ctr_ctx *state, + const uint8_t key[32], + uint16_t nonce); + +#define STREAM128_BLOCKBYTES AES256CTR_BLOCKBYTES +#define STREAM256_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define crh(OUT, IN, INBYTES) shake256(OUT, CRHBYTES, IN, INBYTES) +#define stream128_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream128_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream128_release(STATE) +#define stream256_init(STATE, SEED, NONCE) \ + PQCLEAN_DILITHIUM5AES_CLEAN_dilithium_aes256ctr_init(STATE, SEED, NONCE) +#define stream256_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_DILITHIUM5AES_CLEAN_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define stream256_release(STATE) + + +#endif diff --git a/test/duplicate_consistency/dilithium2_avx2.yml b/test/duplicate_consistency/dilithium2_avx2.yml index 136b3719..8b459d35 100644 --- a/test/duplicate_consistency/dilithium2_avx2.yml +++ b/test/duplicate_consistency/dilithium2_avx2.yml @@ -1,63 +1,133 @@ consistency_checks: -- source: - scheme: dilithium3 - implementation: avx2 - files: - - alignment.h - - fips202x4.c - - fips202x4.h - - nttconsts.c - - nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: avx2 - files: - - alignment.h - - fips202x4.c - - fips202x4.h - - nttconsts.c - - 
nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium2 - implementation: clean - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h + - source: + scheme: dilithium2 + implementation: clean + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium3 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rejsample.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - 
packing.c + - rejsample.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c diff --git a/test/duplicate_consistency/dilithium2_clean.yml b/test/duplicate_consistency/dilithium2_clean.yml index 6d90ddca..dc8c6f1e 100644 --- a/test/duplicate_consistency/dilithium2_clean.yml +++ b/test/duplicate_consistency/dilithium2_clean.yml @@ -1,54 +1,137 @@ consistency_checks: -- source: - scheme: dilithium3 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.c - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium2 - implementation: avx2 - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h + - source: + scheme: dilithium2 + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - 
polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-shake.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-shake.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium2aes_avx2.yml b/test/duplicate_consistency/dilithium2aes_avx2.yml new file mode 100644 index 00000000..4500bac6 --- /dev/null +++ b/test/duplicate_consistency/dilithium2aes_avx2.yml @@ -0,0 +1,129 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - 
api.h + - packing.h + - params.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - sign.c + - source: + scheme: dilithium5 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rejsample.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - rejsample.c + - sign.c diff --git a/test/duplicate_consistency/dilithium2aes_clean.yml b/test/duplicate_consistency/dilithium2aes_clean.yml new file mode 100644 index 00000000..43da6b25 --- /dev/null +++ b/test/duplicate_consistency/dilithium2aes_clean.yml @@ -0,0 +1,135 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium2 + 
implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium3_avx2.yml b/test/duplicate_consistency/dilithium3_avx2.yml index d272857f..6d7de84d 100644 --- a/test/duplicate_consistency/dilithium3_avx2.yml +++ b/test/duplicate_consistency/dilithium3_avx2.yml @@ -1,63 +1,132 @@ consistency_checks: -- source: - scheme: dilithium2 - implementation: avx2 - files: - - alignment.h - - 
fips202x4.c - - fips202x4.h - - nttconsts.c - - nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: avx2 - files: - - alignment.h - - fips202x4.c - - fips202x4.h - - nttconsts.c - - nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium3 - implementation: clean - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h + - source: + scheme: dilithium2 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - 
source: + scheme: dilithium5 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - packing.c + - rounding.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c diff --git a/test/duplicate_consistency/dilithium3_clean.yml b/test/duplicate_consistency/dilithium3_clean.yml index 57a38b3a..9c3f97a3 100644 --- a/test/duplicate_consistency/dilithium3_clean.yml +++ b/test/duplicate_consistency/dilithium3_clean.yml @@ -1,54 +1,139 @@ consistency_checks: -- source: - scheme: dilithium2 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.c - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium3 - implementation: avx2 - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h + - source: + scheme: dilithium2 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - 
symmetric-shake.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - symmetric-shake.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium3aes_avx2.yml b/test/duplicate_consistency/dilithium3aes_avx2.yml new file mode 100644 index 00000000..f8292b03 --- /dev/null +++ b/test/duplicate_consistency/dilithium3aes_avx2.yml @@ -0,0 
+1,128 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - sign.c + - source: + scheme: dilithium3 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - api.h + - packing.h + - params.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - rounding.c + - sign.c diff --git a/test/duplicate_consistency/dilithium3aes_clean.yml b/test/duplicate_consistency/dilithium3aes_clean.yml new file mode 100644 index 
00000000..eb3f4514 --- /dev/null +++ b/test/duplicate_consistency/dilithium3aes_clean.yml @@ -0,0 +1,137 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium5aes + 
implementation: avx2 + files: + - packing.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium4_avx2.yml b/test/duplicate_consistency/dilithium4_avx2.yml deleted file mode 100644 index 9486f63a..00000000 --- a/test/duplicate_consistency/dilithium4_avx2.yml +++ /dev/null @@ -1,63 +0,0 @@ -consistency_checks: -- source: - scheme: dilithium2 - implementation: avx2 - files: - - alignment.h - - fips202x4.c - - fips202x4.h - - nttconsts.c - - nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium3 - implementation: avx2 - files: - - alignment.h - - fips202x4.c - - fips202x4.h - - nttconsts.c - - nttconsts.h - - ntt.S - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.h - - reduce.S - - rounding.c - - rounding.h - - rejsample.h - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: clean - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h diff --git a/test/duplicate_consistency/dilithium4_clean.yml b/test/duplicate_consistency/dilithium4_clean.yml deleted file mode 100644 index 96eb0cbe..00000000 --- a/test/duplicate_consistency/dilithium4_clean.yml +++ /dev/null @@ -1,53 +0,0 @@ -consistency_checks: -- source: - scheme: dilithium2 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c - - stream.h - - symmetric.h -- source: - scheme: dilithium3 - implementation: clean - files: - - ntt.c - - ntt.h - - packing.c - - packing.h - - poly.h - - polyvec.c - - polyvec.h - - reduce.c - - reduce.h - - rounding.c - - rounding.h - - sign.c - - sign.h - - stream.c 
- - stream.h - - symmetric.h -- source: - scheme: dilithium4 - implementation: avx2 - files: - - api.h - - packing.c - - packing.h - - polyvec.h - - params.h - - stream.c - - stream.h - - symmetric.h diff --git a/test/duplicate_consistency/dilithium5_avx2.yml b/test/duplicate_consistency/dilithium5_avx2.yml new file mode 100644 index 00000000..f050e1b1 --- /dev/null +++ b/test/duplicate_consistency/dilithium5_avx2.yml @@ -0,0 +1,135 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rejsample.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - packing.c + - rejsample.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - fips202x4.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - consts.c + - fips202x4.c + - packing.c + - rounding.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium5 + 
implementation: clean + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c diff --git a/test/duplicate_consistency/dilithium5_clean.yml b/test/duplicate_consistency/dilithium5_clean.yml new file mode 100644 index 00000000..a9a5366e --- /dev/null +++ b/test/duplicate_consistency/dilithium5_clean.yml @@ -0,0 +1,139 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-shake.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - symmetric-shake.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - packing.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - 
sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - symmetric.h + - packing.c + - symmetric-shake.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium5aes_avx2.yml b/test/duplicate_consistency/dilithium5aes_avx2.yml new file mode 100644 index 00000000..f8bf3b45 --- /dev/null +++ b/test/duplicate_consistency/dilithium5aes_avx2.yml @@ -0,0 +1,131 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rejsample.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - rejsample.c + - sign.c + - source: + scheme: dilithium3 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + 
- source: + scheme: dilithium3aes + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - aes256ctr.h + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - consts.c + - packing.c + - polyvec.c + - rounding.c + - sign.c + - source: + scheme: dilithium5 + implementation: clean + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - align.h + - cdecl.h + - consts.h + - ntt.h + - packing.h + - rejsample.h + - rounding.h + - sign.h + - consts.c + - packing.c + - rounding.c + - source: + scheme: dilithium5aes + implementation: clean + files: + - api.h + - packing.h + - params.h + - sign.h + - packing.c diff --git a/test/duplicate_consistency/dilithium5aes_clean.yml b/test/duplicate_consistency/dilithium5aes_clean.yml new file mode 100644 index 00000000..970ecc88 --- /dev/null +++ b/test/duplicate_consistency/dilithium5aes_clean.yml @@ -0,0 +1,137 @@ +consistency_checks: + - source: + scheme: dilithium2 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - source: + scheme: dilithium2 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium2aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium2aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + 
- packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium3 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium3aes + implementation: clean + files: + - aes256ctr.h + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - symmetric.h + - aes256ctr.c + - ntt.c + - packing.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - symmetric-aes.c + - source: + scheme: dilithium3aes + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5 + implementation: clean + files: + - ntt.h + - packing.h + - poly.h + - polyvec.h + - reduce.h + - rounding.h + - sign.h + - ntt.c + - packing.c + - poly.c + - polyvec.c + - reduce.c + - rounding.c + - sign.c + - source: + scheme: dilithium5 + implementation: avx2 + files: + - packing.h + - sign.h + - packing.c + - source: + scheme: dilithium5aes + implementation: avx2 + files: + - api.h + - packing.h + - params.h + - sign.h + - packing.c