From c0f56ccdc263826a3a831bcf993e34cd79183c2c Mon Sep 17 00:00:00 2001 From: Thom Wiggers Date: Fri, 31 Jul 2020 08:17:42 +0200 Subject: [PATCH] Update Kyber from upstream Makes Kyber-AVX run on MacOS (#251) --- crypto_kem/kyber1024-90s/META.yml | 1 + crypto_kem/kyber1024-90s/avx2/LICENSE | 12 +- crypto_kem/kyber1024-90s/avx2/Makefile | 42 +- crypto_kem/kyber1024-90s/avx2/aes256ctr.c | 145 ++-- crypto_kem/kyber1024-90s/avx2/aes256ctr.h | 19 +- crypto_kem/kyber1024-90s/avx2/align.h | 22 + crypto_kem/kyber1024-90s/avx2/basemul.S | 82 ++- crypto_kem/kyber1024-90s/avx2/cbd.c | 14 +- crypto_kem/kyber1024-90s/avx2/cbd.h | 9 +- crypto_kem/kyber1024-90s/avx2/cdecl.inc | 30 + crypto_kem/kyber1024-90s/avx2/consts.c | 177 ++++- crypto_kem/kyber1024-90s/avx2/consts.h | 28 +- crypto_kem/kyber1024-90s/avx2/fq.S | 129 ++++ crypto_kem/kyber1024-90s/avx2/fq.inc | 11 +- crypto_kem/kyber1024-90s/avx2/indcpa.c | 277 ++++--- crypto_kem/kyber1024-90s/avx2/indcpa.h | 23 +- .../invntt.s => kyber1024-90s/avx2/invntt.S} | 70 +- crypto_kem/kyber1024-90s/avx2/invntt.s | 217 ------ crypto_kem/kyber1024-90s/avx2/kem.c | 112 +-- crypto_kem/kyber1024-90s/avx2/kem.h | 19 + crypto_kem/kyber1024-90s/avx2/ntt.S | 220 ++++++ crypto_kem/kyber1024-90s/avx2/ntt.h | 32 +- crypto_kem/kyber1024-90s/avx2/params.h | 20 +- crypto_kem/kyber1024-90s/avx2/poly.c | 427 ++++++----- crypto_kem/kyber1024-90s/avx2/poly.h | 42 +- crypto_kem/kyber1024-90s/avx2/polyvec.c | 182 +++-- crypto_kem/kyber1024-90s/avx2/polyvec.h | 30 +- crypto_kem/kyber1024-90s/avx2/reduce.h | 12 +- crypto_kem/kyber1024-90s/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber1024-90s/avx2/rejsample.h | 9 +- crypto_kem/kyber1024-90s/avx2/shuffle.S | 255 +++++++ crypto_kem/kyber1024-90s/avx2/shuffle.inc | 2 + crypto_kem/kyber1024-90s/avx2/symmetric.h | 24 +- crypto_kem/kyber1024-90s/avx2/verify.c | 41 +- crypto_kem/kyber1024-90s/avx2/verify.h | 9 +- crypto_kem/kyber1024-90s/clean/LICENSE | 12 +- crypto_kem/kyber1024-90s/clean/Makefile | 25 
+- .../clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber1024-90s/clean/cbd.c | 28 +- crypto_kem/kyber1024-90s/clean/cbd.h | 9 +- crypto_kem/kyber1024-90s/clean/indcpa.c | 201 +++--- crypto_kem/kyber1024-90s/clean/indcpa.h | 23 +- crypto_kem/kyber1024-90s/clean/kem.c | 94 ++- crypto_kem/kyber1024-90s/clean/kem.h | 19 + crypto_kem/kyber1024-90s/clean/ntt.c | 119 +-- crypto_kem/kyber1024-90s/clean/ntt.h | 19 +- crypto_kem/kyber1024-90s/clean/params.h | 20 +- crypto_kem/kyber1024-90s/clean/poly.c | 278 +++---- crypto_kem/kyber1024-90s/clean/poly.h | 41 +- crypto_kem/kyber1024-90s/clean/polyvec.c | 171 +++-- crypto_kem/kyber1024-90s/clean/polyvec.h | 30 +- crypto_kem/kyber1024-90s/clean/reduce.c | 31 +- crypto_kem/kyber1024-90s/clean/reduce.h | 12 +- .../clean/{aes256ctr.c => symmetric-aes.c} | 5 +- .../clean/{aes256ctr.h => symmetric-aes.h} | 0 crypto_kem/kyber1024-90s/clean/symmetric.h | 14 +- crypto_kem/kyber1024-90s/clean/verify.c | 23 +- crypto_kem/kyber1024-90s/clean/verify.h | 9 +- crypto_kem/kyber1024/META.yml | 1 + crypto_kem/kyber1024/avx2/LICENSE | 12 +- crypto_kem/kyber1024/avx2/Makefile | 48 +- crypto_kem/kyber1024/avx2/align.h | 22 + crypto_kem/kyber1024/avx2/basemul.S | 82 ++- crypto_kem/kyber1024/avx2/cbd.c | 14 +- crypto_kem/kyber1024/avx2/cbd.h | 9 +- crypto_kem/kyber1024/avx2/cdecl.inc | 30 + crypto_kem/kyber1024/avx2/consts.c | 177 ++++- crypto_kem/kyber1024/avx2/consts.h | 28 +- crypto_kem/kyber1024/avx2/fips202x4.c | 321 ++++----- crypto_kem/kyber1024/avx2/fips202x4.h | 40 +- .../avx2/fq.s => kyber1024/avx2/fq.S} | 93 ++- crypto_kem/kyber1024/avx2/fq.inc | 11 +- crypto_kem/kyber1024/avx2/indcpa.c | 280 +++++--- crypto_kem/kyber1024/avx2/indcpa.h | 23 +- .../avx2/invntt.s => kyber1024/avx2/invntt.S} | 70 +- crypto_kem/kyber1024/avx2/kem.c | 112 +-- crypto_kem/kyber1024/avx2/kem.h | 19 + .../avx2/ntt.s => kyber1024/avx2/ntt.S} | 65 +- crypto_kem/kyber1024/avx2/ntt.h | 32 +- crypto_kem/kyber1024/avx2/params.h | 20 +- 
crypto_kem/kyber1024/avx2/poly.c | 460 ++++++------ crypto_kem/kyber1024/avx2/poly.h | 43 +- crypto_kem/kyber1024/avx2/polyvec.c | 182 +++-- crypto_kem/kyber1024/avx2/polyvec.h | 30 +- crypto_kem/kyber1024/avx2/reduce.h | 12 +- crypto_kem/kyber1024/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber1024/avx2/rejsample.h | 9 +- .../shuffle.s => kyber1024/avx2/shuffle.S} | 111 ++- crypto_kem/kyber1024/avx2/shuffle.inc | 2 + crypto_kem/kyber1024/avx2/symmetric-fips202.c | 63 -- crypto_kem/kyber1024/avx2/symmetric-shake.c | 60 ++ crypto_kem/kyber1024/avx2/symmetric.h | 30 +- crypto_kem/kyber1024/avx2/verify.c | 41 +- crypto_kem/kyber1024/avx2/verify.h | 9 +- crypto_kem/kyber1024/clean/LICENSE | 12 +- crypto_kem/kyber1024/clean/Makefile | 4 +- .../kyber1024/clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber1024/clean/cbd.c | 28 +- crypto_kem/kyber1024/clean/cbd.h | 9 +- crypto_kem/kyber1024/clean/indcpa.c | 201 +++--- crypto_kem/kyber1024/clean/indcpa.h | 23 +- crypto_kem/kyber1024/clean/kem.c | 94 ++- crypto_kem/kyber1024/clean/kem.h | 19 + crypto_kem/kyber1024/clean/ntt.c | 119 +-- crypto_kem/kyber1024/clean/ntt.h | 19 +- crypto_kem/kyber1024/clean/params.h | 20 +- crypto_kem/kyber1024/clean/poly.c | 278 +++---- crypto_kem/kyber1024/clean/poly.h | 41 +- crypto_kem/kyber1024/clean/polyvec.c | 171 +++-- crypto_kem/kyber1024/clean/polyvec.h | 30 +- crypto_kem/kyber1024/clean/reduce.c | 31 +- crypto_kem/kyber1024/clean/reduce.h | 12 +- .../kyber1024/clean/symmetric-fips202.c | 63 -- crypto_kem/kyber1024/clean/symmetric-shake.c | 60 ++ crypto_kem/kyber1024/clean/symmetric.h | 30 +- crypto_kem/kyber1024/clean/verify.c | 23 +- crypto_kem/kyber1024/clean/verify.h | 9 +- crypto_kem/kyber512-90s/META.yml | 1 + crypto_kem/kyber512-90s/avx2/LICENSE | 12 +- crypto_kem/kyber512-90s/avx2/Makefile | 42 +- crypto_kem/kyber512-90s/avx2/aes256ctr.c | 145 ++-- crypto_kem/kyber512-90s/avx2/aes256ctr.h | 19 +- crypto_kem/kyber512-90s/avx2/align.h | 22 + 
crypto_kem/kyber512-90s/avx2/basemul.S | 82 ++- crypto_kem/kyber512-90s/avx2/cbd.c | 14 +- crypto_kem/kyber512-90s/avx2/cbd.h | 9 +- crypto_kem/kyber512-90s/avx2/cdecl.inc | 30 + crypto_kem/kyber512-90s/avx2/consts.c | 177 ++++- crypto_kem/kyber512-90s/avx2/consts.h | 28 +- crypto_kem/kyber512-90s/avx2/{fq.s => fq.S} | 93 ++- crypto_kem/kyber512-90s/avx2/fq.inc | 11 +- crypto_kem/kyber512-90s/avx2/indcpa.c | 277 ++++--- crypto_kem/kyber512-90s/avx2/indcpa.h | 23 +- .../invntt.s => kyber512-90s/avx2/invntt.S} | 70 +- crypto_kem/kyber512-90s/avx2/kem.c | 112 +-- crypto_kem/kyber512-90s/avx2/kem.h | 19 + .../avx2/ntt.s => kyber512-90s/avx2/ntt.S} | 65 +- crypto_kem/kyber512-90s/avx2/ntt.h | 32 +- crypto_kem/kyber512-90s/avx2/params.h | 20 +- crypto_kem/kyber512-90s/avx2/poly.c | 423 ++++++----- crypto_kem/kyber512-90s/avx2/poly.h | 42 +- crypto_kem/kyber512-90s/avx2/polyvec.c | 162 +++-- crypto_kem/kyber512-90s/avx2/polyvec.h | 30 +- crypto_kem/kyber512-90s/avx2/reduce.h | 12 +- crypto_kem/kyber512-90s/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber512-90s/avx2/rejsample.h | 9 +- .../shuffle.s => kyber512-90s/avx2/shuffle.S} | 111 ++- crypto_kem/kyber512-90s/avx2/shuffle.inc | 2 + crypto_kem/kyber512-90s/avx2/symmetric.h | 24 +- crypto_kem/kyber512-90s/avx2/verify.c | 41 +- crypto_kem/kyber512-90s/avx2/verify.h | 9 +- crypto_kem/kyber512-90s/clean/LICENSE | 12 +- crypto_kem/kyber512-90s/clean/Makefile | 25 +- .../clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber512-90s/clean/cbd.c | 28 +- crypto_kem/kyber512-90s/clean/cbd.h | 9 +- crypto_kem/kyber512-90s/clean/indcpa.c | 201 +++--- crypto_kem/kyber512-90s/clean/indcpa.h | 23 +- crypto_kem/kyber512-90s/clean/kem.c | 94 ++- crypto_kem/kyber512-90s/clean/kem.h | 19 + crypto_kem/kyber512-90s/clean/ntt.c | 119 +-- crypto_kem/kyber512-90s/clean/ntt.h | 19 +- crypto_kem/kyber512-90s/clean/params.h | 20 +- crypto_kem/kyber512-90s/clean/poly.c | 274 +++---- crypto_kem/kyber512-90s/clean/poly.h | 41 +- 
crypto_kem/kyber512-90s/clean/polyvec.c | 151 ++-- crypto_kem/kyber512-90s/clean/polyvec.h | 30 +- crypto_kem/kyber512-90s/clean/reduce.c | 31 +- crypto_kem/kyber512-90s/clean/reduce.h | 12 +- .../clean/{aes256ctr.c => symmetric-aes.c} | 5 +- .../clean/{aes256ctr.h => symmetric-aes.h} | 0 crypto_kem/kyber512-90s/clean/symmetric.h | 14 +- crypto_kem/kyber512-90s/clean/verify.c | 23 +- crypto_kem/kyber512-90s/clean/verify.h | 9 +- crypto_kem/kyber512/META.yml | 23 +- crypto_kem/kyber512/avx2/LICENSE | 12 +- crypto_kem/kyber512/avx2/Makefile | 48 +- crypto_kem/kyber512/avx2/align.h | 22 + crypto_kem/kyber512/avx2/basemul.S | 82 ++- crypto_kem/kyber512/avx2/cbd.c | 14 +- crypto_kem/kyber512/avx2/cbd.h | 9 +- crypto_kem/kyber512/avx2/cdecl.inc | 30 + crypto_kem/kyber512/avx2/consts.c | 177 ++++- crypto_kem/kyber512/avx2/consts.h | 28 +- crypto_kem/kyber512/avx2/fips202x4.c | 321 ++++----- crypto_kem/kyber512/avx2/fips202x4.h | 40 +- .../avx2/fq.s => kyber512/avx2/fq.S} | 93 ++- crypto_kem/kyber512/avx2/fq.inc | 11 +- crypto_kem/kyber512/avx2/indcpa.c | 270 ++++--- crypto_kem/kyber512/avx2/indcpa.h | 23 +- .../avx2/invntt.s => kyber512/avx2/invntt.S} | 70 +- crypto_kem/kyber512/avx2/kem.c | 112 +-- crypto_kem/kyber512/avx2/kem.h | 19 + .../avx2/ntt.s => kyber512/avx2/ntt.S} | 65 +- crypto_kem/kyber512/avx2/ntt.h | 32 +- crypto_kem/kyber512/avx2/params.h | 20 +- crypto_kem/kyber512/avx2/poly.c | 456 ++++++------ crypto_kem/kyber512/avx2/poly.h | 43 +- crypto_kem/kyber512/avx2/polyvec.c | 162 +++-- crypto_kem/kyber512/avx2/polyvec.h | 30 +- crypto_kem/kyber512/avx2/reduce.h | 12 +- crypto_kem/kyber512/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber512/avx2/rejsample.h | 9 +- .../shuffle.s => kyber512/avx2/shuffle.S} | 111 ++- crypto_kem/kyber512/avx2/shuffle.inc | 2 + crypto_kem/kyber512/avx2/symmetric-fips202.c | 63 -- crypto_kem/kyber512/avx2/symmetric-shake.c | 60 ++ crypto_kem/kyber512/avx2/symmetric.h | 30 +- crypto_kem/kyber512/avx2/verify.c | 41 +- 
crypto_kem/kyber512/avx2/verify.h | 9 +- crypto_kem/kyber512/clean/LICENSE | 12 +- crypto_kem/kyber512/clean/Makefile | 4 +- .../kyber512/clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber512/clean/cbd.c | 28 +- crypto_kem/kyber512/clean/cbd.h | 9 +- crypto_kem/kyber512/clean/indcpa.c | 201 +++--- crypto_kem/kyber512/clean/indcpa.h | 23 +- crypto_kem/kyber512/clean/kem.c | 94 ++- crypto_kem/kyber512/clean/kem.h | 19 + crypto_kem/kyber512/clean/ntt.c | 119 +-- crypto_kem/kyber512/clean/ntt.h | 19 +- crypto_kem/kyber512/clean/params.h | 20 +- crypto_kem/kyber512/clean/poly.c | 274 +++---- crypto_kem/kyber512/clean/poly.h | 41 +- crypto_kem/kyber512/clean/polyvec.c | 151 ++-- crypto_kem/kyber512/clean/polyvec.h | 30 +- crypto_kem/kyber512/clean/reduce.c | 31 +- crypto_kem/kyber512/clean/reduce.h | 12 +- crypto_kem/kyber512/clean/symmetric-fips202.c | 63 -- crypto_kem/kyber512/clean/symmetric-shake.c | 60 ++ crypto_kem/kyber512/clean/symmetric.h | 30 +- crypto_kem/kyber512/clean/verify.c | 23 +- crypto_kem/kyber512/clean/verify.h | 9 +- crypto_kem/kyber768-90s/META.yml | 27 +- crypto_kem/kyber768-90s/avx2/LICENSE | 12 +- crypto_kem/kyber768-90s/avx2/Makefile | 42 +- crypto_kem/kyber768-90s/avx2/aes256ctr.c | 145 ++-- crypto_kem/kyber768-90s/avx2/aes256ctr.h | 19 +- crypto_kem/kyber768-90s/avx2/align.h | 22 + crypto_kem/kyber768-90s/avx2/basemul.S | 82 ++- crypto_kem/kyber768-90s/avx2/cbd.c | 14 +- crypto_kem/kyber768-90s/avx2/cbd.h | 9 +- crypto_kem/kyber768-90s/avx2/cdecl.inc | 30 + crypto_kem/kyber768-90s/avx2/consts.c | 177 ++++- crypto_kem/kyber768-90s/avx2/consts.h | 28 +- crypto_kem/kyber768-90s/avx2/fq.S | 129 ++++ crypto_kem/kyber768-90s/avx2/fq.inc | 11 +- crypto_kem/kyber768-90s/avx2/fq.s | 112 --- crypto_kem/kyber768-90s/avx2/indcpa.c | 277 ++++--- crypto_kem/kyber768-90s/avx2/indcpa.h | 23 +- crypto_kem/kyber768-90s/avx2/invntt.S | 225 ++++++ crypto_kem/kyber768-90s/avx2/invntt.s | 217 ------ crypto_kem/kyber768-90s/avx2/kem.c | 112 +-- 
crypto_kem/kyber768-90s/avx2/kem.h | 19 + crypto_kem/kyber768-90s/avx2/ntt.S | 220 ++++++ crypto_kem/kyber768-90s/avx2/ntt.h | 32 +- crypto_kem/kyber768-90s/avx2/ntt.s | 209 ------ crypto_kem/kyber768-90s/avx2/params.h | 20 +- crypto_kem/kyber768-90s/avx2/poly.c | 415 +++++------ crypto_kem/kyber768-90s/avx2/poly.h | 42 +- crypto_kem/kyber768-90s/avx2/polyvec.c | 162 +++-- crypto_kem/kyber768-90s/avx2/polyvec.h | 30 +- crypto_kem/kyber768-90s/avx2/reduce.h | 12 +- crypto_kem/kyber768-90s/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber768-90s/avx2/rejsample.h | 9 +- crypto_kem/kyber768-90s/avx2/shuffle.S | 255 +++++++ crypto_kem/kyber768-90s/avx2/shuffle.inc | 2 + crypto_kem/kyber768-90s/avx2/shuffle.s | 206 ------ crypto_kem/kyber768-90s/avx2/symmetric.h | 24 +- crypto_kem/kyber768-90s/avx2/verify.c | 41 +- crypto_kem/kyber768-90s/avx2/verify.h | 9 +- crypto_kem/kyber768-90s/clean/LICENSE | 12 +- crypto_kem/kyber768-90s/clean/Makefile | 25 +- .../clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber768-90s/clean/cbd.c | 28 +- crypto_kem/kyber768-90s/clean/cbd.h | 9 +- crypto_kem/kyber768-90s/clean/indcpa.c | 201 +++--- crypto_kem/kyber768-90s/clean/indcpa.h | 23 +- crypto_kem/kyber768-90s/clean/kem.c | 94 ++- crypto_kem/kyber768-90s/clean/kem.h | 19 + crypto_kem/kyber768-90s/clean/ntt.c | 119 +-- crypto_kem/kyber768-90s/clean/ntt.h | 19 +- crypto_kem/kyber768-90s/clean/params.h | 20 +- crypto_kem/kyber768-90s/clean/poly.c | 266 +++---- crypto_kem/kyber768-90s/clean/poly.h | 41 +- crypto_kem/kyber768-90s/clean/polyvec.c | 151 ++-- crypto_kem/kyber768-90s/clean/polyvec.h | 30 +- crypto_kem/kyber768-90s/clean/reduce.c | 31 +- crypto_kem/kyber768-90s/clean/reduce.h | 12 +- .../clean/{aes256ctr.c => symmetric-aes.c} | 5 +- .../clean/{aes256ctr.h => symmetric-aes.h} | 0 crypto_kem/kyber768-90s/clean/symmetric.h | 14 +- crypto_kem/kyber768-90s/clean/verify.c | 23 +- crypto_kem/kyber768-90s/clean/verify.h | 9 +- crypto_kem/kyber768/META.yml | 25 +- 
crypto_kem/kyber768/avx2/LICENSE | 12 +- crypto_kem/kyber768/avx2/Makefile | 48 +- crypto_kem/kyber768/avx2/align.h | 22 + crypto_kem/kyber768/avx2/basemul.S | 82 ++- crypto_kem/kyber768/avx2/cbd.c | 14 +- crypto_kem/kyber768/avx2/cbd.h | 9 +- crypto_kem/kyber768/avx2/cdecl.inc | 30 + crypto_kem/kyber768/avx2/consts.c | 177 ++++- crypto_kem/kyber768/avx2/consts.h | 28 +- crypto_kem/kyber768/avx2/fips202x4.c | 321 ++++----- crypto_kem/kyber768/avx2/fips202x4.h | 40 +- .../avx2/fq.s => kyber768/avx2/fq.S} | 93 ++- crypto_kem/kyber768/avx2/fq.inc | 11 +- crypto_kem/kyber768/avx2/fq.s | 112 --- crypto_kem/kyber768/avx2/indcpa.c | 354 +++++---- crypto_kem/kyber768/avx2/indcpa.h | 23 +- crypto_kem/kyber768/avx2/invntt.S | 225 ++++++ crypto_kem/kyber768/avx2/kem.c | 112 +-- crypto_kem/kyber768/avx2/kem.h | 19 + .../avx2/ntt.s => kyber768/avx2/ntt.S} | 65 +- crypto_kem/kyber768/avx2/ntt.h | 32 +- crypto_kem/kyber768/avx2/ntt.s | 209 ------ crypto_kem/kyber768/avx2/params.h | 20 +- crypto_kem/kyber768/avx2/poly.c | 448 ++++++------ crypto_kem/kyber768/avx2/poly.h | 43 +- crypto_kem/kyber768/avx2/polyvec.c | 162 +++-- crypto_kem/kyber768/avx2/polyvec.h | 30 +- crypto_kem/kyber768/avx2/reduce.h | 12 +- crypto_kem/kyber768/avx2/rejsample.c | 676 +++++++++--------- crypto_kem/kyber768/avx2/rejsample.h | 9 +- .../shuffle.s => kyber768/avx2/shuffle.S} | 111 ++- crypto_kem/kyber768/avx2/shuffle.inc | 2 + crypto_kem/kyber768/avx2/shuffle.s | 206 ------ crypto_kem/kyber768/avx2/symmetric-fips202.c | 63 -- crypto_kem/kyber768/avx2/symmetric-shake.c | 60 ++ crypto_kem/kyber768/avx2/symmetric.h | 30 +- crypto_kem/kyber768/avx2/verify.c | 41 +- crypto_kem/kyber768/avx2/verify.h | 9 +- crypto_kem/kyber768/clean/LICENSE | 12 +- crypto_kem/kyber768/clean/Makefile | 4 +- .../kyber768/clean/Makefile.Microsoft_nmake | 2 +- crypto_kem/kyber768/clean/cbd.c | 28 +- crypto_kem/kyber768/clean/cbd.h | 9 +- crypto_kem/kyber768/clean/indcpa.c | 201 +++--- crypto_kem/kyber768/clean/indcpa.h | 23 +- 
crypto_kem/kyber768/clean/kem.c | 94 ++- crypto_kem/kyber768/clean/kem.h | 19 + crypto_kem/kyber768/clean/ntt.c | 119 +-- crypto_kem/kyber768/clean/ntt.h | 19 +- crypto_kem/kyber768/clean/params.h | 20 +- crypto_kem/kyber768/clean/poly.c | 266 +++---- crypto_kem/kyber768/clean/poly.h | 41 +- crypto_kem/kyber768/clean/polyvec.c | 151 ++-- crypto_kem/kyber768/clean/polyvec.h | 30 +- crypto_kem/kyber768/clean/reduce.c | 31 +- crypto_kem/kyber768/clean/reduce.h | 12 +- crypto_kem/kyber768/clean/symmetric-fips202.c | 63 -- crypto_kem/kyber768/clean/symmetric-shake.c | 60 ++ crypto_kem/kyber768/clean/symmetric.h | 30 +- crypto_kem/kyber768/clean/verify.c | 23 +- crypto_kem/kyber768/clean/verify.h | 9 +- .../kyber1024-90s_avx2.yml | 18 +- test/duplicate_consistency/kyber1024_avx2.yml | 12 +- .../duplicate_consistency/kyber1024_clean.yml | 2 +- .../kyber512-90s_avx2.yml | 17 +- test/duplicate_consistency/kyber512_avx2.yml | 12 +- test/duplicate_consistency/kyber512_clean.yml | 2 +- .../kyber768-90s_avx2.yml | 17 +- test/duplicate_consistency/kyber768_avx2.yml | 12 +- test/duplicate_consistency/kyber768_clean.yml | 2 +- 365 files changed, 15895 insertions(+), 12343 deletions(-) create mode 100644 crypto_kem/kyber1024-90s/avx2/align.h create mode 100644 crypto_kem/kyber1024-90s/avx2/cdecl.inc create mode 100644 crypto_kem/kyber1024-90s/avx2/fq.S rename crypto_kem/{kyber768/avx2/invntt.s => kyber1024-90s/avx2/invntt.S} (76%) delete mode 100644 crypto_kem/kyber1024-90s/avx2/invntt.s create mode 100644 crypto_kem/kyber1024-90s/avx2/kem.h create mode 100644 crypto_kem/kyber1024-90s/avx2/ntt.S create mode 100644 crypto_kem/kyber1024-90s/avx2/shuffle.S create mode 100644 crypto_kem/kyber1024-90s/clean/kem.h rename crypto_kem/kyber1024-90s/clean/{aes256ctr.c => symmetric-aes.c} (98%) rename crypto_kem/kyber1024-90s/clean/{aes256ctr.h => symmetric-aes.h} (100%) create mode 100644 crypto_kem/kyber1024/avx2/align.h create mode 100644 crypto_kem/kyber1024/avx2/cdecl.inc rename 
crypto_kem/{kyber512/avx2/fq.s => kyber1024/avx2/fq.S} (54%) rename crypto_kem/{kyber512/avx2/invntt.s => kyber1024/avx2/invntt.S} (76%) create mode 100644 crypto_kem/kyber1024/avx2/kem.h rename crypto_kem/{kyber1024-90s/avx2/ntt.s => kyber1024/avx2/ntt.S} (81%) rename crypto_kem/{kyber512-90s/avx2/shuffle.s => kyber1024/avx2/shuffle.S} (66%) delete mode 100644 crypto_kem/kyber1024/avx2/symmetric-fips202.c create mode 100644 crypto_kem/kyber1024/avx2/symmetric-shake.c create mode 100644 crypto_kem/kyber1024/clean/kem.h delete mode 100644 crypto_kem/kyber1024/clean/symmetric-fips202.c create mode 100644 crypto_kem/kyber1024/clean/symmetric-shake.c create mode 100644 crypto_kem/kyber512-90s/avx2/align.h create mode 100644 crypto_kem/kyber512-90s/avx2/cdecl.inc rename crypto_kem/kyber512-90s/avx2/{fq.s => fq.S} (54%) rename crypto_kem/{kyber1024/avx2/invntt.s => kyber512-90s/avx2/invntt.S} (76%) create mode 100644 crypto_kem/kyber512-90s/avx2/kem.h rename crypto_kem/{kyber512/avx2/ntt.s => kyber512-90s/avx2/ntt.S} (81%) rename crypto_kem/{kyber1024-90s/avx2/shuffle.s => kyber512-90s/avx2/shuffle.S} (65%) create mode 100644 crypto_kem/kyber512-90s/clean/kem.h rename crypto_kem/kyber512-90s/clean/{aes256ctr.c => symmetric-aes.c} (98%) rename crypto_kem/kyber512-90s/clean/{aes256ctr.h => symmetric-aes.h} (100%) create mode 100644 crypto_kem/kyber512/avx2/align.h create mode 100644 crypto_kem/kyber512/avx2/cdecl.inc rename crypto_kem/{kyber1024-90s/avx2/fq.s => kyber512/avx2/fq.S} (54%) rename crypto_kem/{kyber512-90s/avx2/invntt.s => kyber512/avx2/invntt.S} (75%) create mode 100644 crypto_kem/kyber512/avx2/kem.h rename crypto_kem/{kyber1024/avx2/ntt.s => kyber512/avx2/ntt.S} (81%) rename crypto_kem/{kyber1024/avx2/shuffle.s => kyber512/avx2/shuffle.S} (66%) delete mode 100644 crypto_kem/kyber512/avx2/symmetric-fips202.c create mode 100644 crypto_kem/kyber512/avx2/symmetric-shake.c create mode 100644 crypto_kem/kyber512/clean/kem.h delete mode 100644 
crypto_kem/kyber512/clean/symmetric-fips202.c create mode 100644 crypto_kem/kyber512/clean/symmetric-shake.c create mode 100644 crypto_kem/kyber768-90s/avx2/align.h create mode 100644 crypto_kem/kyber768-90s/avx2/cdecl.inc create mode 100644 crypto_kem/kyber768-90s/avx2/fq.S delete mode 100644 crypto_kem/kyber768-90s/avx2/fq.s create mode 100644 crypto_kem/kyber768-90s/avx2/invntt.S delete mode 100644 crypto_kem/kyber768-90s/avx2/invntt.s create mode 100644 crypto_kem/kyber768-90s/avx2/kem.h create mode 100644 crypto_kem/kyber768-90s/avx2/ntt.S delete mode 100644 crypto_kem/kyber768-90s/avx2/ntt.s create mode 100644 crypto_kem/kyber768-90s/avx2/shuffle.S delete mode 100644 crypto_kem/kyber768-90s/avx2/shuffle.s create mode 100644 crypto_kem/kyber768-90s/clean/kem.h rename crypto_kem/kyber768-90s/clean/{aes256ctr.c => symmetric-aes.c} (98%) rename crypto_kem/kyber768-90s/clean/{aes256ctr.h => symmetric-aes.h} (100%) create mode 100644 crypto_kem/kyber768/avx2/align.h create mode 100644 crypto_kem/kyber768/avx2/cdecl.inc rename crypto_kem/{kyber1024/avx2/fq.s => kyber768/avx2/fq.S} (54%) delete mode 100644 crypto_kem/kyber768/avx2/fq.s create mode 100644 crypto_kem/kyber768/avx2/invntt.S create mode 100644 crypto_kem/kyber768/avx2/kem.h rename crypto_kem/{kyber512-90s/avx2/ntt.s => kyber768/avx2/ntt.S} (81%) delete mode 100644 crypto_kem/kyber768/avx2/ntt.s rename crypto_kem/{kyber512/avx2/shuffle.s => kyber768/avx2/shuffle.S} (66%) delete mode 100644 crypto_kem/kyber768/avx2/shuffle.s delete mode 100644 crypto_kem/kyber768/avx2/symmetric-fips202.c create mode 100644 crypto_kem/kyber768/avx2/symmetric-shake.c create mode 100644 crypto_kem/kyber768/clean/kem.h delete mode 100644 crypto_kem/kyber768/clean/symmetric-fips202.c create mode 100644 crypto_kem/kyber768/clean/symmetric-shake.c diff --git a/crypto_kem/kyber1024-90s/META.yml b/crypto_kem/kyber1024-90s/META.yml index 025729aa..6b52fe0b 100644 --- a/crypto_kem/kyber1024-90s/META.yml +++ 
b/crypto_kem/kyber1024-90s/META.yml @@ -28,6 +28,7 @@ implementations: - architecture: x86_64 operating_systems: - Linux + - Darwin required_flags: - aes - avx2 diff --git a/crypto_kem/kyber1024-90s/avx2/LICENSE b/crypto_kem/kyber1024-90s/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber1024-90s/avx2/LICENSE +++ b/crypto_kem/kyber1024-90s/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber1024-90s/avx2/Makefile b/crypto_kem/kyber1024-90s/avx2/Makefile index aac0b7b7..25635c4b 100644 --- a/crypto_kem/kyber1024-90s/avx2/Makefile +++ b/crypto_kem/kyber1024-90s/avx2/Makefile @@ -1,9 +1,40 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024-90s_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h aes256ctr.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o aes256ctr.o +HEADERS= \ + aes256ctr.h \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + aes256ctr.o \ + basemul.o \ + cbd.o \ + consts.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + verify.o CFLAGS=-mavx2 -maes -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ -Wmissing-prototypes -Wredundant-decls -std=c99 \ @@ -14,11 +45,8 @@ all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o 
$@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(AR) -r $@ $(OBJECTS) diff --git a/crypto_kem/kyber1024-90s/avx2/aes256ctr.c b/crypto_kem/kyber1024-90s/avx2/aes256ctr.c index b7dbd90d..404794a0 100644 --- a/crypto_kem/kyber1024-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber1024-90s/avx2/aes256ctr.c @@ -1,94 +1,68 @@ /* - crypto_stream_aes256ctr - based heavily on public-domain code by Romain Dolbeau + Based heavily on public-domain code by Romain Dolbeau Different handling of nonce+counter than original version - using separated 96-bit nonce and internal 32-bit counter, starting from zero + using separated 64-bit nonce and internal 64-bit counter, starting from zero Public Domain */ #include "aes256ctr.h" - #include +#include #include -static inline void aesni_encrypt8(uint8_t *out, +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { - __m128i nv0; - __m128i nv1; - __m128i nv2; - __m128i nv3; - __m128i nv4; - __m128i nv5; - __m128i nv6; - __m128i nv7; + __m128i f, f0, f1, f2, f3, t; /* Load current counter value */ - __m128i nv0i = _mm_load_si128(n); - - /* Increase counter in 8 consecutive blocks */ - nv0 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(0, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv1 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(1, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv2 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(2, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv3 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(3, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv4 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(4, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv5 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(5, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 
0, 1, 2, 3, 4, 5, 6, 7)); - nv6 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(6, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv7 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(7, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - - /* Write counter for next iteration, increased by 8 */ - _mm_store_si128(n, _mm_add_epi32(nv0i, _mm_set_epi64x(8, 0))); - - /* Actual AES encryption, 8x interleaved */ - __m128i temp0 = _mm_xor_si128(nv0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(nv1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(nv2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(nv3, rkeys[0]); - __m128i temp4 = _mm_xor_si128(nv4, rkeys[0]); - __m128i temp5 = _mm_xor_si128(nv5, rkeys[0]); - __m128i temp6 = _mm_xor_si128(nv6, rkeys[0]); - __m128i temp7 = _mm_xor_si128(nv7, rkeys[0]); - - for (uint8_t i = 1; i < 14; i++) { - temp0 = _mm_aesenc_si128(temp0, rkeys[i]); - temp1 = _mm_aesenc_si128(temp1, rkeys[i]); - temp2 = _mm_aesenc_si128(temp2, rkeys[i]); - temp3 = _mm_aesenc_si128(temp3, rkeys[i]); - temp4 = _mm_aesenc_si128(temp4, rkeys[i]); - temp5 = _mm_aesenc_si128(temp5, rkeys[i]); - temp6 = _mm_aesenc_si128(temp6, rkeys[i]); - temp7 = _mm_aesenc_si128(temp7, rkeys[i]); + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + t = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, t); + f1 = _mm_xor_si128(f1, t); + f2 = _mm_xor_si128(f2, t); + f3 = _mm_xor_si128(f3, t); + + 
for (int i = 1; i < 14; i++) { + t = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, t); + f1 = _mm_aesenc_si128(f1, t); + f2 = _mm_aesenc_si128(f2, t); + f3 = _mm_aesenc_si128(f3, t); } - temp0 = _mm_aesenclast_si128(temp0, rkeys[14]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[14]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[14]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[14]); - temp4 = _mm_aesenclast_si128(temp4, rkeys[14]); - temp5 = _mm_aesenclast_si128(temp5, rkeys[14]); - temp6 = _mm_aesenclast_si128(temp6, rkeys[14]); - temp7 = _mm_aesenclast_si128(temp7, rkeys[14]); + t = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, t); + f1 = _mm_aesenclast_si128(f1, t); + f2 = _mm_aesenclast_si128(f2, t); + f3 = _mm_aesenclast_si128(f3, t); /* Write results */ - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); - _mm_storeu_si128((__m128i *)(out + 64), temp4); - _mm_storeu_si128((__m128i *)(out + 80), temp5); - _mm_storeu_si128((__m128i *)(out + 96), temp6); - _mm_storeu_si128((__m128i *)(out + 112), temp7); + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); } -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce) { - __m128i key0 = _mm_loadu_si128((__m128i *)(key + 0)); - __m128i key1 = _mm_loadu_si128((__m128i *)(key + 16)); - __m128i temp0, temp1, temp2, temp4; - size_t idx = 0; +void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n 
= _mm_loadl_epi64((__m128i *)&nonce); state->rkeys[idx++] = key0; temp0 = key0; @@ -137,38 +111,33 @@ void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, state->rkeys[idx++] = temp0; } -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce) { - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); -} - void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state) { - size_t i; - + size_t i = 0; for (i = 0; i < nblocks; i++) { - aesni_encrypt8(out, &state->n, state->rkeys); - out += 128; + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; } } void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, - const uint8_t *seed, - uint8_t nonce) { - size_t i; - uint8_t buf[128]; + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i = 0; + uint8_t buf[64]; aes256ctr_ctx state; - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, (uint16_t)nonce << 8); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, nonce); - while (outlen >= 128) { - aesni_encrypt8(out, &state.n, state.rkeys); - outlen -= 128; + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; } if (outlen) { - aesni_encrypt8(buf, &state.n, state.rkeys); + aesni_encrypt4(buf, &state.n, state.rkeys); for (i = 0; i < outlen; i++) { out[i] = buf[i]; } diff --git a/crypto_kem/kyber1024-90s/avx2/aes256ctr.h b/crypto_kem/kyber1024-90s/avx2/aes256ctr.h index 5f5f2281..d40f1463 100644 --- a/crypto_kem/kyber1024-90s/avx2/aes256ctr.h +++ b/crypto_kem/kyber1024-90s/avx2/aes256ctr.h @@ -5,22 +5,17 @@ #include #include +#define AES256CTR_NAMESPACE(s) pqcrystals_aes256ctr_avx2##s + +#define AES256CTR_BLOCKBYTES 64 + typedef struct { __m128i rkeys[16]; __m128i n; } aes256ctr_ctx; -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce); -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t 
nonce); -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, - size_t nblocks, - aes256ctr_ctx *state); - -void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, - size_t outlen, - const uint8_t *seed, - uint8_t nonce); +void PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce); +void PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state); +void PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t seed[32], uint64_t nonce); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/align.h b/crypto_kem/kyber1024-90s/avx2/align.h new file mode 100644 index 00000000..7227b8f0 --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER102490S_AVX2_ALIGN_H +#define PQCLEAN_KYBER102490S_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber1024-90s/avx2/basemul.S b/crypto_kem/kyber1024-90s/avx2/basemul.S index 11331222..abd3eda5 100644 --- a/crypto_kem/kyber1024-90s/avx2/basemul.S +++ b/crypto_kem/kyber1024-90s/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx -PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 - 
+.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_basemul_avx -PQCLEAN_KYBER102490S_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx): #consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add 
$128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git a/crypto_kem/kyber1024-90s/avx2/cbd.c b/crypto_kem/kyber1024-90s/avx2/cbd.c index a4f7e484..6007d47a 100644 --- a/crypto_kem/kyber1024-90s/avx2/cbd.c +++ b/crypto_kem/kyber1024-90s/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER102490S_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *buf: pointer to input byte array +* - const unsigned char *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER102490S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber1024-90s/avx2/cbd.h b/crypto_kem/kyber1024-90s/avx2/cbd.h index 7858b4cf..72190f5d 100644 --- a/crypto_kem/kyber1024-90s/avx2/cbd.h +++ b/crypto_kem/kyber1024-90s/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER102490S_AVX2_CBD_H +#define PQCLEAN_KYBER102490S_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER102490S_AVX2_cbd(poly *r, const 
uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/cdecl.inc b/crypto_kem/kyber1024-90s/avx2/cdecl.inc new file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber1024-90s/avx2/consts.c b/crypto_kem/kyber1024-90s/avx2/consts.c index 557f888a..0a2dd000 100644 --- a/crypto_kem/kyber1024-90s/avx2/consts.c +++ b/crypto_kem/kyber1024-90s/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 
20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 
2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 
20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) -#define FLO (FHI * QINV % 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqhi = {.as_arr 
= {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 
182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 
45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 
1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git a/crypto_kem/kyber1024-90s/avx2/consts.h b/crypto_kem/kyber1024-90s/avx2/consts.h index 7b0b44fc..968cbe07 100644 --- a/crypto_kem/kyber1024-90s/avx2/consts.h +++ b/crypto_kem/kyber1024-90s/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER102490S_AVX2_CONSTS_H +#define PQCLEAN_KYBER102490S_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmontsqlo; -extern const aligned_uint16_t 
PQCLEAN_KYBER102490S_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER102490S_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER102490S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber1024-90s/avx2/fq.S b/crypto_kem/kyber1024-90s/avx2/fq.S new file mode 100644 index 00000000..23ddb9fe --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/fq.S @@ -0,0 +1,129 @@ +#include "cdecl.inc" +.include "fq.inc" + +.text +reduce128_avx: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) +vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_reduce_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret + +csubq128_avx: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm2 +vmovdqa 64(%rdi),%ymm3 +vmovdqa 96(%rdi),%ymm4 +vmovdqa 128(%rdi),%ymm5 +vmovdqa 160(%rdi),%ymm6 +vmovdqa 192(%rdi),%ymm7 +vmovdqa 224(%rdi),%ymm8 + +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 + +#store +vmovdqa %ymm1,(%rdi) +vmovdqa %ymm2,32(%rdi) +vmovdqa %ymm3,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm6,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm8,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_csubq_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret + +tomont128_avx: +#load +vmovdqa (%rdi),%ymm3 +vmovdqa 32(%rdi),%ymm4 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 
96(%rdi),%ymm6 +vmovdqa 128(%rdi),%ymm7 +vmovdqa 160(%rdi),%ymm8 +vmovdqa 192(%rdi),%ymm9 +vmovdqa 224(%rdi),%ymm10 + +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 + +#store +vmovdqa %ymm3,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm7,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm9,192(%rdi) +vmovdqa %ymm10,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber1024-90s/avx2/fq.inc b/crypto_kem/kyber1024-90s/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber1024-90s/avx2/fq.inc +++ b/crypto_kem/kyber1024-90s/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro fqmulprecomp al,ah,b x=12 +.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber1024-90s/avx2/indcpa.c b/crypto_kem/kyber1024-90s/avx2/indcpa.c index 8e3be6c0..35343d57 100644 --- a/crypto_kem/kyber1024-90s/avx2/indcpa.c +++ b/crypto_kem/kyber1024-90s/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include 
"ntt.h" +#include "params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include /************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + 
const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* polyvec *b: pointer to the input vector of polynomials b +* poly *v:
pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER102490S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER102490S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER102490S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned 
int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,46 +150,47 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER102490S_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER102490S_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - union { - uint8_t x[XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; + ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; aes256ctr_ctx state; 
PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, seed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_K; j++) { + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_K; j++) { if (transposed) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (i << 8) + j); + nonce.orig = (j << 8) | i; } else { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (j << 8) + i); + nonce.orig = (i << 8) | j; } - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, GEN_MATRIX_MAXNBLOCKS, &state); - ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf.x, GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES); + state.n = _mm_loadl_epi64(&nonce.vec); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); + ctr = PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); while (ctr < KYBER_N) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.x, 1, &state); - ctr += rej_uniform_ref(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.x, XOF_BLOCKBYTES); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, + XOF_BLOCKBYTES); } PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -164,47 +199,53 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES 
bytes) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t coins[128]; - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER102490S_AVX2_cbd(skpv.vec + i, coins); + ALIGN32_ARRAY(uint8_t, 128) coins; + PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER102490S_AVX2_cbd(&skpv.vec[i], coins.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER102490S_AVX2_cbd(e.vec + i, coins); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + 
PQCLEAN_KYBER102490S_AVX2_cbd(&e.vec[i], coins.arr); } PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - PQCLEAN_KYBER102490S_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER102490S_AVX2_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER102490S_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -215,58 +256,67 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly 
k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER102490S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, seed.arr); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t buf[128]; - PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER102490S_AVX2_cbd(sp.vec + i, buf); + ALIGN32_ARRAY(uint8_t, 128) buf; + PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER102490S_AVX2_cbd(&sp.vec[i], buf.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER102490S_AVX2_cbd(ep.vec + i, buf); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER102490S_AVX2_cbd(&ep.vec[i], buf.arr); } - PQCLEAN_KYBER102490S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf); + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + 
state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER102490S_AVX2_cbd(&epp, buf.arr); PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER102490S_AVX2_poly_invntt(&v); + PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER102490S_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER102490S_AVX2_poly_add(&v, &v, &epp); @@ -278,18 +328,21 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER102490S_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -297,8 +350,8 @@ void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER102490S_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER102490S_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER102490S_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber1024-90s/avx2/indcpa.h b/crypto_kem/kyber1024-90s/avx2/indcpa.h index 031b713d..3813ad08 100644 --- a/crypto_kem/kyber1024-90s/avx2/indcpa.h +++ b/crypto_kem/kyber1024-90s/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER102490S_AVX2_INDCPA_H +#define PQCLEAN_KYBER102490S_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER102490S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void 
PQCLEAN_KYBER102490S_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER102490S_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER102490S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER102490S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber768/avx2/invntt.s b/crypto_kem/kyber1024-90s/avx2/invntt.S similarity index 76% rename from crypto_kem/kyber768/avx2/invntt.s rename to crypto_kem/kyber1024-90s/avx2/invntt.S index fd175443..84d19a7e 100644 --- a/crypto_kem/kyber768/avx2/invntt.s +++ b/crypto_kem/kyber1024-90s/avx2/invntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 #update & mul vpsubw %ymm\rh0,%ymm\rl0,%ymm12 vpsubw %ymm\rh1,%ymm\rl1,%ymm13 @@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.global PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - +.text +invntt_levels0t5_avx: level0: #zetas vmovdqu (%rsi),%ymm15 @@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 level1: #zetas vmovdqu 128(%rsi),%ymm3 vmovdqu 160(%rsi),%ymm2 -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 shuffle1 4,5,3,5 
shuffle1 6,7,4,7 @@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 vmovdqu 224(%rsi),%ymm2 #consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1 +vmovdqa _16XV*2(%rdx),%ymm1 -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 red16 3 @@ -95,7 +92,7 @@ level3: vmovdqu 256(%rsi),%ymm9 vmovdqu 288(%rsi),%ymm2 -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 red16 10 @@ -109,7 +106,7 @@ level4: vmovdqu 320(%rsi),%ymm7 vmovdqu 352(%rsi),%ymm2 -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 red16 9 @@ -123,7 +120,7 @@ level5: vpbroadcastd 384(%rsi),%ymm8 vpbroadcastd 388(%rsi),%ymm2 -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 red16 7 @@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER768_AVX2_invntt_level6_avx -PQCLEAN_KYBER768_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - +invntt_level6_avx: #zetas vpbroadcastd (%rsi),%ymm1 vpbroadcastd 4(%rsi),%ymm2 @@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,256(%rdi) @@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) vmovdqa %ymm10,320(%rdi) vmovdqa %ymm11,352(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,(%rdi) @@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER768_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,384(%rdi) @@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) vmovdqa %ymm10,448(%rdi) vmovdqa %ymm11,480(%rdi) 
-fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,128(%rdi) @@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) vmovdqa %ymm7,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber1024-90s/avx2/invntt.s b/crypto_kem/kyber1024-90s/avx2/invntt.s deleted file mode 100644 index 923fffad..00000000 --- a/crypto_kem/kyber1024-90s/avx2/invntt.s +++ /dev/null @@ -1,217 +0,0 @@ -.include "shuffle.inc" -.include "fq.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - -vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 -vpmullw %ymm\zl0,%ymm12,%ymm\rh0 - -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 -vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 - -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 -vpmullw %ymm\zl1,%ymm14,%ymm\rh2 -vpmullw %ymm\zl1,%ymm15,%ymm\rh3 - -vpmulhw %ymm\zh0,%ymm12,%ymm12 -vpmulhw %ymm\zh0,%ymm13,%ymm13 - -vpmulhw %ymm\zh1,%ymm14,%ymm14 -vpmulhw %ymm\zh1,%ymm15,%ymm15 - -#reduce -vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 -vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 -vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 -vpsubw %ymm\rh0,%ymm12,%ymm\rh0 -vpsubw %ymm\rh1,%ymm13,%ymm\rh1 -vpsubw %ymm\rh2,%ymm14,%ymm\rh2 -vpsubw %ymm\rh3,%ymm15,%ymm\rh3 -.endm - -.global PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 - -level0: 
-#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 - -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 - -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 - -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 - -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 - -red16 3 - -shuffle2 3,4,10,4 -shuffle2 6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 - -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 - -red16 10 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 - -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 - -red16 9 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 - -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 - -red16 7 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx -PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 - -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - 
-#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13 - -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xfhi(%rip),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret diff --git a/crypto_kem/kyber1024-90s/avx2/kem.c b/crypto_kem/kyber1024-90s/avx2/kem.c index 716aab33..f2a11073 100644 --- a/crypto_kem/kyber1024-90s/avx2/kem.c +++ b/crypto_kem/kyber1024-90s/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of 
CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER102490S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) 
**************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER102490S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - 
const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER102490S_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 
* KYBER_SYMBYTES); - PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER102490S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER102490S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER102490S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER102490S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024-90s/avx2/kem.h b/crypto_kem/kyber1024-90s/avx2/kem.h new file mode 100644 index 00000000..6953252c --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER102490S_AVX2_KEM_H +#define PQCLEAN_KYBER102490S_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER102490S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber1024-90s/avx2/ntt.S b/crypto_kem/kyber1024-90s/avx2/ntt.S new file mode 100644 index 00000000..5625d5ee --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/ntt.S @@ -0,0 +1,220 @@ +#include "cdecl.inc" +.include "shuffle.inc" +.include "fq.inc" + +.macro butterfly 
rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 +#mul +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 + +#reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 +vpsubw %ymm12,%ymm\rh0,%ymm12 +vpsubw %ymm13,%ymm\rh1,%ymm13 +vpsubw %ymm14,%ymm\rh2,%ymm14 +vpsubw %ymm15,%ymm\rh3,%ymm15 + +#update +vpsubw %ymm12,%ymm\rl0,%ymm\rh0 +vpaddw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw %ymm13,%ymm\rl1,%ymm\rh1 +vpaddw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl2,%ymm\rh2 +vpaddw %ymm14,%ymm\rl2,%ymm\rl2 +vpsubw %ymm15,%ymm\rl3,%ymm\rh3 +vpaddw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +# We break the dependency chains with the cost of slightly more additions. +# But they can be run in parallel to the multiplications on execution port 5 +# (multiplications only go to ports 0 and 1) +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +#mul +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 + +#reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 + +vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 +vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 +vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 +vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 +vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 + +#update +vpaddw %ymm12,%ymm\rh0,%ymm\rh0 +vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpaddw %ymm13,%ymm\rh1,%ymm\rh1 +vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpaddw 
%ymm14,%ymm\rh2,%ymm\rh2 +vpsubw %ymm14,%ymm\rl2,%ymm\rl2 +vpaddw %ymm15,%ymm\rh3,%ymm\rh3 +vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.text +ntt_level0_avx: +level0: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 256(%rdi),%ymm8 +vmovdqa 288(%rdi),%ymm9 +vmovdqa 320(%rdi),%ymm10 +vmovdqa 352(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm7,96(%rdi) +vmovdqa %ymm8,256(%rdi) +vmovdqa %ymm9,288(%rdi) +vmovdqa %ymm10,320(%rdi) +vmovdqa %ymm11,352(%rdi) + +ret + +ntt_levels1t6_avx: +level1: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11,3 + +level2: +#zetas +vmovdqu 8(%rsi),%ymm15 +vmovdqu 40(%rsi),%ymm1 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly2 3,8,4,9,5,10,6,11,7 + +level3: +#zetas +vmovdqu 72(%rsi),%ymm15 +vmovdqu 104(%rsi),%ymm1 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly2 7,5,3,10,8,6,4,11,9 + +level4: +#zetas +vmovdqu 136(%rsi),%ymm15 +vmovdqu 168(%rsi),%ymm1 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +butterfly2 9,8,7,6,5,4,3,11,10 + +level5: +#zetas +vmovdqu 200(%rsi),%ymm15 +vmovdqu 232(%rsi),%ymm1 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +butterfly2 10,5,9,4,8,3,7,11,6 + +level6: +#zetas +vmovdqu 264(%rsi),%ymm14 +vmovdqu 328(%rsi),%ymm15 +vmovdqu 296(%rsi),%ymm1 +vmovdqu 360(%rsi),%ymm2 + +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 + +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 
+red16 3,6 +red16 7,12 +red16 11,13 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber1024-90s/avx2/ntt.h b/crypto_kem/kyber1024-90s/avx2/ntt.h index 8606d7e9..9df5c48c 100644 --- a/crypto_kem/kyber1024-90s/avx2/ntt.h +++ b/crypto_kem/kyber1024-90s/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER102490S_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER102490S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + +void PQCLEAN_KYBER102490S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t 
*PQCLEAN_KYBER102490S_AVX2_qdata); + +void PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + + +void PQCLEAN_KYBER102490S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + +void PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + + +void PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + +void PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/params.h b/crypto_kem/kyber1024-90s/avx2/params.h index 85dcf73a..cbf12dca 100644 --- a/crypto_kem/kyber1024-90s/avx2/params.h +++ b/crypto_kem/kyber1024-90s/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER102490S_AVX2_PARAMS_H +#define PQCLEAN_KYBER102490S_AVX2_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 #define KYBER_POLYCOMPRESSEDBYTES 160 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES 
(KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber1024-90s/avx2/poly.c b/crypto_kem/kyber1024-90s/avx2/poly.c index 7dac78dd..af88a7a2 100644 --- a/crypto_kem/kyber1024-90s/avx2/poly.c +++ b/crypto_kem/kyber1024-90s/avx2/poly.c @@ -1,113 +1,210 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER102490S_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER102490S_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31); + t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } - r[k] = (uint8_t)( t[0] | (t[1] << 5)); - r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); - r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[k + 4] = (uint8_t)((t[6] >> 2) 
| (t[7] << 3)); - k += 5; + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER102490S_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER102490S_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); +void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = 
(a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); a += 5; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER102490S_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER102490S_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER102490S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER102490S_AVX2_qdata); +} + +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void 
PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_load_si256((__m256i *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to 
input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs, a); - PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + __m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER102490S_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - 
PQCLEAN_KYBER102490S_AVX2_cbd(r, buf); +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER102490S_AVX2_cbd(r, buf.arr); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER102490S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -116,73 +213,78 @@ void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8 * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp); - PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER102490S_AVX2_zetas_exp); - PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 4); - PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER102490S_AVX2_zetas_exp + 200); + PQCLEAN_KYBER102490S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp); - PQCLEAN_KYBER102490S_AVX2_invntt_levels0t5_avx(r->coeffs + 
128, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER102490S_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER102490S_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r) { + PQCLEAN_KYBER102490S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER102490S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 64, - a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380); +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER102490S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs); - 
PQCLEAN_KYBER102490S_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER102490S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER102490S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER102490S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER102490S_AVX2_poly_add * * Description: Add two polynomials * @@ -191,18 +293,19 @@ void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER102490S_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -211,127 +314,13 @@ void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i 
*)&b->coeffs[i]); - vec0 = _mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; 
i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = _mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = 
_mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber1024-90s/avx2/poly.h b/crypto_kem/kyber1024-90s/avx2/poly.h index 634e4d04..63bf3f01 100644 --- a/crypto_kem/kyber1024-90s/avx2/poly.h +++ b/crypto_kem/kyber1024-90s/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER102490S_AVX2_POLY_H +#define PQCLEAN_KYBER102490S_AVX2_POLY_H #include "params.h" - #include #include @@ -11,32 +10,47 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER102490S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER102490S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER102490S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER102490S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER102490S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER102490S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER102490S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER102490S_AVX2_poly_ntt(poly *r); -void 
PQCLEAN_KYBER102490S_AVX2_poly_invntt(poly *r); + +void PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER102490S_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER102490S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER102490S_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER102490S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER102490S_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER102490S_AVX2_poly_reduce(poly *r); + void PQCLEAN_KYBER102490S_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER102490S_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER102490S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/polyvec.c b/crypto_kem/kyber1024-90s/avx2/polyvec.c index ea6fd941..c51c6f78 100644 --- a/crypto_kem/kyber1024-90s/avx2/polyvec.c +++ b/crypto_kem/kyber1024-90s/avx2/polyvec.c @@ -1,167 +1,198 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(a); uint16_t t[8]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - for (size_t k = 0; k < 8; k++) { - t[k] = 
((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) + / KYBER_Q) & 0x7ff; + } } - r[11 * j + 0] = (uint8_t)t[0]; - r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); - r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); - r[11 * j + 3] = (uint8_t)((t[2] >> 2)); - r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); - r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); - r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); - r[11 * j + 7] = (uint8_t)((t[5] >> 1)); - r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); - r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); - r[11 * j + 10] = (uint8_t)((t[7] >> 3)); + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; } - r += 352; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 
0; j < KYBER_N / 8; j++) { - r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); +void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } } 
- a += 352; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials 
**************************************************/ void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER102490S_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER102490S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER102490S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each 
element of a vector of polynomials +* for details of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER102490S_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -170,7 +201,8 @@ void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber1024-90s/avx2/polyvec.h b/crypto_kem/kyber1024-90s/avx2/polyvec.h index 4692fbef..511cd5d3 100644 --- a/crypto_kem/kyber1024-90s/avx2/polyvec.h +++ b/crypto_kem/kyber1024-90s/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER102490S_AVX2_POLYVEC_H +#define PQCLEAN_KYBER102490S_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER102490S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void 
PQCLEAN_KYBER102490S_AVX2_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER102490S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER102490S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER102490S_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER102490S_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER102490S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER102490S_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER102490S_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER102490S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/reduce.h b/crypto_kem/kyber1024-90s/avx2/reduce.h index 360ba637..8c7b116b 100644 --- a/crypto_kem/kyber1024-90s/avx2/reduce.h +++ b/crypto_kem/kyber1024-90s/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER102490S_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER102490S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + +int16_t PQCLEAN_KYBER102490S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); + +int16_t PQCLEAN_KYBER102490S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER102490S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/rejsample.c b/crypto_kem/kyber1024-90s/avx2/rejsample.c index 1d8f73e8..a75068f1 100644 --- a/crypto_kem/kyber1024-90s/avx2/rejsample.c +++ b/crypto_kem/kyber1024-90s/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" 
#include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 
10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, - { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 
14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 
0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, 
-1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, 
+ { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, 
-1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, 
-1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, 
-1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 576 +unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER102490S_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, 
pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XQ]); + const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER102490S_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = 
_mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = _mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = _mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - 
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, 
_mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber1024-90s/avx2/rejsample.h b/crypto_kem/kyber1024-90s/avx2/rejsample.h index b0ecf88b..03f04912 100644 --- a/crypto_kem/kyber1024-90s/avx2/rejsample.h +++ b/crypto_kem/kyber1024-90s/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER102490S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER102490S_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/shuffle.S b/crypto_kem/kyber1024-90s/avx2/shuffle.S new file mode 100644 index 00000000..e60befbe --- /dev/null +++ b/crypto_kem/kyber1024-90s/avx2/shuffle.S @@ -0,0 +1,255 @@ +#include "cdecl.inc" +.include "fq.inc" +.include "shuffle.inc" + +/* +nttpack_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa 
%ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret +*/ + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#csubq +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 
+shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu %ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx): +#consts +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 +shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx) +cdecl(PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/crypto_kem/kyber1024-90s/avx2/shuffle.inc 
b/crypto_kem/kyber1024-90s/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber1024-90s/avx2/shuffle.inc +++ b/crypto_kem/kyber1024-90s/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber1024-90s/avx2/symmetric.h b/crypto_kem/kyber1024-90s/avx2/symmetric.h index 4ee7b202..47579fd8 100644 --- a/crypto_kem/kyber1024-90s/avx2/symmetric.h +++ b/crypto_kem/kyber1024-90s/avx2/symmetric.h @@ -2,22 +2,26 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "aes256ctr.h" #include "sha2.h" -#define hash_h(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) -#define hash_g(OUT, IN, INBYTES) sha512((OUT), (IN), (INBYTES)) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_AVX2_aes256ctr_init((STATE), (IN), (Y) + ((uint16_t)(X) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks((OUT), (OUTBLOCKS), (STATE)) -#define xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf((OUT), (OUTBYTES), (KEY), (NONCE)) -#define kdf(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) - -#define XOF_BLOCKBYTES 128 - typedef aes256ctr_ctx xof_state; +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) +#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) +#define xof_absorb(STATE, SEED, X, Y) \ + PQCLEAN_KYBER102490S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_KYBER102490S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER102490S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define kdf(OUT, IN, INBYTES) sha256(OUT, 
IN, INBYTES) + #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber1024-90s/avx2/verify.c b/crypto_kem/kyber1024-90s/avx2/verify.c index 23aec250..836771e8 100644 --- a/crypto_kem/kyber1024-90s/avx2/verify.c +++ b/crypto_kem/kyber1024-90s/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER102490S_AVX2_verify * * Description: Compare two arrays for equality in constant time. * -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, siz avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = _mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER102490S_AVX2_cmov * * Description: Copy len bytes from x to r if 
b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. * -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber1024-90s/avx2/verify.h b/crypto_kem/kyber1024-90s/avx2/verify.h index 572aa11f..ff8dfe4d 100644 --- a/crypto_kem/kyber1024-90s/avx2/verify.h +++ b/crypto_kem/kyber1024-90s/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER102490S_AVX2_VERIFY_H +#define PQCLEAN_KYBER102490S_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER102490S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER102490S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber1024-90s/clean/LICENSE b/crypto_kem/kyber1024-90s/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber1024-90s/clean/LICENSE +++ b/crypto_kem/kyber1024-90s/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - 
Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber1024-90s/clean/Makefile b/crypto_kem/kyber1024-90s/clean/Makefile index 29aec0ad..21c159e1 100644 --- a/crypto_kem/kyber1024-90s/clean/Makefile +++ b/crypto_kem/kyber1024-90s/clean/Makefile @@ -1,8 +1,29 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024-90s_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h aes256ctr.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o aes256ctr.o +HEADERS= \ + api.h \ + cbd.h \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + symmetric-aes.h \ + symmetric.h \ + verify.h +OBJECTS= \ + cbd.o \ + indcpa.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + reduce.o \ + verify.o \ + symmetric-aes.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake index 920180ff..4e0c51a8 100644 --- a/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber1024-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber1024-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj aes256ctr.obj +OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-aes.o # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber1024-90s/clean/cbd.c b/crypto_kem/kyber1024-90s/clean/cbd.c index 
8b7b95d1..92fb85a6 100644 --- a/crypto_kem/kyber1024-90s/clean/cbd.c +++ b/crypto_kem/kyber1024-90s/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" #include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x **************************************************/ -static uint32_t load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER102490S_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber1024-90s/clean/cbd.h b/crypto_kem/kyber1024-90s/clean/cbd.h index d1e4ecdd..7e9c635e 100644 --- 
a/crypto_kem/kyber1024-90s/clean/cbd.h +++ b/crypto_kem/kyber1024-90s/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_CBD_H +#define PQCLEAN_KYBER102490S_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER102490S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024-90s/clean/indcpa.c b/crypto_kem/kyber1024-90s/clean/indcpa.c index e60c89b2..32b3404c 100644 --- a/crypto_kem/kyber1024-90s/clean/indcpa.c +++ b/crypto_kem/kyber1024-90s/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output 
seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: 
pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER102490S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER102490S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in 
bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) **************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER102490S_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER102490S_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption 
scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER102490S_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + 
PQCLEAN_KYBER102490S_CLEAN_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; 
i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER102490S_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER102490S_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER102490S_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER102490S_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber1024-90s/clean/indcpa.h b/crypto_kem/kyber1024-90s/clean/indcpa.h index a2452863..b8d62e3b 100644 --- a/crypto_kem/kyber1024-90s/clean/indcpa.h +++ b/crypto_kem/kyber1024-90s/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_INDCPA_H +#define PQCLEAN_KYBER102490S_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER102490S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); 
-void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber1024-90s/clean/kem.c b/crypto_kem/kyber1024-90s/clean/kem.c index 3ed23315..9020f577 100644 --- a/crypto_kem/kyber1024-90s/clean/kem.c +++ b/crypto_kem/kyber1024-90s/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t 
i; +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER102490S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - 
hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + hash_h(buf, buf, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. 
* * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER102490S_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER102490S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER102490S_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER102490S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - 
kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024-90s/clean/kem.h b/crypto_kem/kyber1024-90s/clean/kem.h new file mode 100644 index 00000000..84951187 --- /dev/null +++ b/crypto_kem/kyber1024-90s/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER102490S_CLEAN_KEM_H +#define PQCLEAN_KYBER102490S_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER102490S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber1024-90s/clean/ntt.c b/crypto_kem/kyber1024-90s/clean/ntt.c index aa395c0b..8537f819 100644 --- a/crypto_kem/kyber1024-90s/clean/ntt.c +++ b/crypto_kem/kyber1024-90s/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER102490S_CLEAN_zetas and PQCLEAN_KYBER102490S_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = 
fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = tmp[tree[i]]; + PQCLEAN_KYBER102490S_CLEAN_zetas[i] = tmp[tree[i]]; k = 0; for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 
2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER102490S_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in 
bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER102490S_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER102490S_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER102490S_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER102490S_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial 
**************************************************/ void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber1024-90s/clean/ntt.h b/crypto_kem/kyber1024-90s/clean/ntt.h index dd4b75ae..a64fd0a7 100644 --- a/crypto_kem/kyber1024-90s/clean/ntt.h +++ b/crypto_kem/kyber1024-90s/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_NTT_H +#define PQCLEAN_KYBER102490S_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER102490S_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER102490S_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER102490S_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER102490S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber1024-90s/clean/params.h b/crypto_kem/kyber1024-90s/clean/params.h index 85dcf73a..97aa969f 100644 --- a/crypto_kem/kyber1024-90s/clean/params.h +++ b/crypto_kem/kyber1024-90s/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER102490S_CLEAN_PARAMS_H +#define PQCLEAN_KYBER102490S_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 #define KYBER_POLYCOMPRESSEDBYTES 160 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define 
KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber1024-90s/clean/poly.c b/crypto_kem/kyber1024-90s/clean/poly.c index a2855378..bafb04a5 100644 --- a/crypto_kem/kyber1024-90s/clean/poly.c +++ b/crypto_kem/kyber1024-90s/clean/poly.c @@ -1,120 +1,177 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 
0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } - r[k] = (uint8_t)( t[0] | (t[1] << 5)); - r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); - r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); - k += 5; + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 5] = (int16_t)(((((a[3] 
>> 1) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); +void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = (a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); a += 5; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: 
PQCLEAN_KYBER102490S_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER102490S_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message 
+* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER102490S_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -128,20 +185,20 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) 
of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER102490S_CLEAN_invntt(r->coeffs); } /************************************************* -* Name: poly_basemul +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -149,68 +206,64 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER102490S_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER102490S_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER102490S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER102490S_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a 
polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce( - (int32_t)r->coeffs[i] * f); + for (i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER102490S_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_add * * Description: Add two polynomials * @@ -219,13 +272,14 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER102490S_CLEAN_poly_sub * * Description: Subtract two polynomials * @@ -234,48 +288,8 @@ void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t 
msg[KYBER_SYMBYTES]) { - uint16_t mask; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER102490S_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber1024-90s/clean/poly.h b/crypto_kem/kyber1024-90s/clean/poly.h index e483c050..1c01914d 100644 --- a/crypto_kem/kyber1024-90s/clean/poly.h +++ b/crypto_kem/kyber1024-90s/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_POLY_H +#define PQCLEAN_KYBER102490S_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER102490S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER102490S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER102490S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER102490S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER102490S_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER102490S_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER102490S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER102490S_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER102490S_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER102490S_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER102490S_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER102490S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + 
void PQCLEAN_KYBER102490S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber1024-90s/clean/polyvec.c b/crypto_kem/kyber1024-90s/clean/polyvec.c index bd5a60b3..aa36c8df 100644 --- a/crypto_kem/kyber1024-90s/clean/polyvec.c +++ b/crypto_kem/kyber1024-90s/clean/polyvec.c @@ -1,138 +1,163 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(a); uint16_t t[8]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - for (size_t k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) + / KYBER_Q) & 0x7ff; + } } - r[11 * j + 0] = (uint8_t)t[0]; - r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); - r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); - r[11 * j + 3] = (uint8_t)((t[2] >> 2)); - r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); - r[11 * j + 5] = 
(uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); - r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); - r[11 * j + 7] = (uint8_t)((t[5] >> 1)); - r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); - r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); - r[11 * j + 10] = (uint8_t)((t[7] >> 3)); + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; } - r += 352; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] 
>> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } } - a += 352; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials 
**************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt 
+* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. * * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER102490S_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER102490S_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER102490S_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER102490S_CLEAN_poly_add(r, r, &t); } @@ -140,37 +165,40 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec 
*a, } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output vector of polynomials **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in reduce.c +* for details of conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - polyvec *r: pointer to input/output vector of polynomials **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER102490S_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -179,7 +207,8 @@ void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i 
= 0; i < KYBER_K; i++) { PQCLEAN_KYBER102490S_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber1024-90s/clean/polyvec.h b/crypto_kem/kyber1024-90s/clean/polyvec.h index 9df6e4d9..00d41ea2 100644 --- a/crypto_kem/kyber1024-90s/clean/polyvec.h +++ b/crypto_kem/kyber1024-90s/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER102490S_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER102490S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER102490S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER102490S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER102490S_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER102490S_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER102490S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER102490S_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER102490S_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER102490S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber1024-90s/clean/reduce.c 
b/crypto_kem/kyber1024-90s/clean/reduce.c index b7b3722b..89ea4861 100644 --- a/crypto_kem/kyber1024-90s/clean/reduce.c +++ b/crypto_kem/kyber1024-90s/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: montgomery_reduce +* Name: PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. **************************************************/ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER102490S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER102490S_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t a: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber1024-90s/clean/reduce.h b/crypto_kem/kyber1024-90s/clean/reduce.h index 04a43f85..d53bafdc 100644 --- a/crypto_kem/kyber1024-90s/clean/reduce.h +++ b/crypto_kem/kyber1024-90s/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_REDUCE_H +#define PQCLEAN_KYBER102490S_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER102490S_CLEAN_montgomery_reduce(int32_t a); + int16_t PQCLEAN_KYBER102490S_CLEAN_barrett_reduce(int16_t a); + int16_t PQCLEAN_KYBER102490S_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber1024-90s/clean/aes256ctr.c b/crypto_kem/kyber1024-90s/clean/symmetric-aes.c similarity index 98% rename from crypto_kem/kyber1024-90s/clean/aes256ctr.c rename to crypto_kem/kyber1024-90s/clean/symmetric-aes.c index 2d4592e5..1728f02e 100644 --- a/crypto_kem/kyber1024-90s/clean/aes256ctr.c +++ b/crypto_kem/kyber1024-90s/clean/symmetric-aes.c @@ -1,4 +1,4 @@ -#include "aes256ctr.h" +#include "symmetric-aes.h" #include "aes.h" #include #include @@ -14,7 +14,7 @@ static inline void br_enc32be(unsigned char *dst, uint32_t x) { static void 
aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { uint8_t ivw[16]; uint8_t buf[AES_BLOCKBYTES]; - size_t i; + size_t i = 0; memcpy(ivw, iv, AESCTR_NONCEBYTES); br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); @@ -94,7 +94,6 @@ void PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nbl s->ctr += (uint32_t) (4 * nblocks); } -/** Free the AES ctx **/ void PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { aes256_ctx_release(&s->sk_exp); } diff --git a/crypto_kem/kyber1024-90s/clean/aes256ctr.h b/crypto_kem/kyber1024-90s/clean/symmetric-aes.h similarity index 100% rename from crypto_kem/kyber1024-90s/clean/aes256ctr.h rename to crypto_kem/kyber1024-90s/clean/symmetric-aes.h diff --git a/crypto_kem/kyber1024-90s/clean/symmetric.h b/crypto_kem/kyber1024-90s/clean/symmetric.h index df1faa56..d5adb1fb 100644 --- a/crypto_kem/kyber1024-90s/clean/symmetric.h +++ b/crypto_kem/kyber1024-90s/clean/symmetric.h @@ -2,22 +2,24 @@ #define SYMMETRIC_H #include "params.h" +#include +#include -#include "aes256ctr.h" #include "sha2.h" +#include "symmetric-aes.h" + +typedef aes256xof_ctx xof_state; + +#define XOF_BLOCKBYTES 64 #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, IN, X, Y) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER102490S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) PQCLEAN_KYBER102490S_CLEAN_aes256xof_ctx_release(STATE) #define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER102490S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) -#define XOF_BLOCKBYTES 64 - -typedef aes256xof_ctx xof_state; - #endif /* SYMMETRIC_H */ diff 
--git a/crypto_kem/kyber1024-90s/clean/verify.c b/crypto_kem/kyber1024-90s/clean/verify.c index 5f98ba9d..2b03b703 100644 --- a/crypto_kem/kyber1024-90s/clean/verify.c +++ b/crypto_kem/kyber1024-90s/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER102490S_CLEAN_verify * * Description: Compare two arrays for equality in constant time. * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER102490S_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. 
Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, si * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber1024-90s/clean/verify.h b/crypto_kem/kyber1024-90s/clean/verify.h index 3afd3fea..2446d798 100644 --- a/crypto_kem/kyber1024-90s/clean/verify.h +++ b/crypto_kem/kyber1024-90s/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER102490S_CLEAN_VERIFY_H +#define PQCLEAN_KYBER102490S_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER102490S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER102490S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber1024/META.yml b/crypto_kem/kyber1024/META.yml index c50507eb..9e75d2ef 100644 --- a/crypto_kem/kyber1024/META.yml +++ b/crypto_kem/kyber1024/META.yml @@ -28,6 +28,7 @@ implementations: - architecture: x86_64 operating_systems: - Linux + - Darwin required_flags: - avx2 - bmi2 diff --git a/crypto_kem/kyber1024/avx2/LICENSE b/crypto_kem/kyber1024/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber1024/avx2/LICENSE +++ b/crypto_kem/kyber1024/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor 
Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber1024/avx2/Makefile b/crypto_kem/kyber1024/avx2/Makefile index c5647c56..d8addf47 100644 --- a/crypto_kem/kyber1024/avx2/Makefile +++ b/crypto_kem/kyber1024/avx2/Makefile @@ -1,26 +1,58 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h fips202x4.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o fips202x4.o symmetric-fips202.o +HEADERS= \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fips202x4.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + basemul.o \ + cbd.o \ + consts.o \ + fips202x4.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + symmetric-shake.o \ + verify.o KECCAK4XDIR=../../../common/keccak4x KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) -CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $(OBJECTS) $(KECCAK4X) diff --git a/crypto_kem/kyber1024/avx2/align.h 
b/crypto_kem/kyber1024/avx2/align.h new file mode 100644 index 00000000..bd9c2be5 --- /dev/null +++ b/crypto_kem/kyber1024/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER1024_AVX2_ALIGN_H +#define PQCLEAN_KYBER1024_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber1024/avx2/basemul.S b/crypto_kem/kyber1024/avx2/basemul.S index 622d63d9..c2ee0bdb 100644 --- a/crypto_kem/kyber1024/avx2/basemul.S +++ b/crypto_kem/kyber1024/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER1024_AVX2_basemul_acc_avx -PQCLEAN_KYBER1024_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 - +.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -117,7 +113,7 @@ vpaddd %ymm12,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -160,7 +156,7 @@ vpaddd %ymm12,%ymm5,%ymm5 vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -168,17 +164,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_basemul_avx -PQCLEAN_KYBER1024_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_basemul_acc_avx): #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 -vmovdqa 
PQCLEAN_KYBER1024_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -187,10 +206,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git a/crypto_kem/kyber1024/avx2/cbd.c b/crypto_kem/kyber1024/avx2/cbd.c index aab0331f..6377f9dc 100644 --- a/crypto_kem/kyber1024/avx2/cbd.c +++ b/crypto_kem/kyber1024/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER1024_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* 
- const uint8_t *buf: pointer to input byte array +* - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER1024_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber1024/avx2/cbd.h b/crypto_kem/kyber1024/avx2/cbd.h index 87f9c77d..7603f2e7 100644 --- a/crypto_kem/kyber1024/avx2/cbd.h +++ b/crypto_kem/kyber1024/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER1024_AVX2_CBD_H +#define PQCLEAN_KYBER1024_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER1024_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024/avx2/cdecl.inc b/crypto_kem/kyber1024/avx2/cdecl.inc new file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber1024/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on macOS exports all symbols with a leading + * underscore.
This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber1024/avx2/consts.c b/crypto_kem/kyber1024/avx2/consts.c index c7591fa7..dfa52c12 100644 --- a/crypto_kem/kyber1024/avx2/consts.c +++ b/crypto_kem/kyber1024/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 
48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 
34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) -#define FLO (FHI * QINV 
% 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER1024_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, 
QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 
5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 
45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 
+ 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git a/crypto_kem/kyber1024/avx2/consts.h b/crypto_kem/kyber1024/avx2/consts.h index 78b99566..ea1376c1 100644 --- a/crypto_kem/kyber1024/avx2/consts.h +++ b/crypto_kem/kyber1024/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER1024_AVX2_CONSTS_H +#define PQCLEAN_KYBER1024_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER1024_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqlo; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER1024_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER1024_AVX2_qdata; #endif diff --git a/crypto_kem/kyber1024/avx2/fips202x4.c b/crypto_kem/kyber1024/avx2/fips202x4.c index 67041f0b..82c2e4c9 100644 --- a/crypto_kem/kyber1024/avx2/fips202x4.c +++ b/crypto_kem/kyber1024/avx2/fips202x4.c @@ -1,148 +1,111 @@ #include "fips202.h" #include "fips202x4.h" -#include "params.h" - #include +#include #include +#include +/* Use implementation from the Keccak Code Package */ +#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds +extern void KeccakF1600_StatePermute4x(__m256i *s); -#define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> 
(64-(offset)))) - -static uint64_t load64(const uint8_t *x) { - unsigned long long r = 0, i; - - for (i = 0; i < 8; ++i) { - r |= (unsigned long long)x[i] << 8 * i; - } - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - size_t i; +static inline void store64(uint8_t x[8], uint64_t u) { + unsigned int i = 0; - for (i = 0; i < 8; ++i) { - x[i] = (uint8_t)u; - u >>= 8; + for (i = 0; i < 8; i++) { + x[i] = u >> 8 * i; } } -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, +static void keccakx4_absorb(__m256i s[25], unsigned int r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, uint8_t p) { - size_t i; - uint8_t t0[200] = {0}; - uint8_t t1[200] = {0}; - uint8_t t2[200] = {0}; - uint8_t t3[200] = {0}; + size_t i = 0, pos = 0; + __m256i t, idx; - unsigned long long *ss = (unsigned long long *)s; + for (i = 0; i < 25; ++i) { + s[i] = _mm256_setzero_si256(); + } - while (mlen >= r) { + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 * i); + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; + inlen -= r; } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; + i = 0; + while (inlen >= 8) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + + i++; + pos += 8; + 
inlen -= 8; } - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 + 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); } + + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, size_t nblocks, - __m256i *s, - unsigned int r) { - unsigned long long *ss = (unsigned long long *)s; + unsigned int r, + __m256i s[25]) { + unsigned int i = 0; + uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < (r >> 3); i++) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); + for (i = 0; i < r / 8; ++i) { + f0 = _mm256_extract_epi64(s[i], 0); + f1 = _mm256_extract_epi64(s[i], 1); + f2 = _mm256_extract_epi64(s[i], 2); + f3 = _mm256_extract_epi64(s[i], 3); + store64(out0, f0); + store64(out1, f1); + store64(out2, f2); + store64(out3, f3); + + out0 += 8; + out1 += 8; + out2 += 8; + out3 += 8; } - h0 += r; - h1 += r; - h2 += r; - h3 += r; - nblocks--; - } -} -void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 2]; - - 
for (size_t i = 0; i < KYBER_SYMBYTES; ++i) { - extseed[0][i] = seed[i]; - extseed[1][i] = seed[i]; - extseed[2][i] = seed[i]; - extseed[3][i] = seed[i]; - } - extseed[0][KYBER_SYMBYTES + 0] = (uint8_t)nonce0; - extseed[0][KYBER_SYMBYTES + 1] = (uint8_t)(nonce0 >> 8); - extseed[1][KYBER_SYMBYTES + 0] = (uint8_t)nonce1; - extseed[1][KYBER_SYMBYTES + 1] = (uint8_t)(nonce1 >> 8); - extseed[2][KYBER_SYMBYTES + 0] = (uint8_t)nonce2; - extseed[2][KYBER_SYMBYTES + 1] = (uint8_t)(nonce2 >> 8); - extseed[3][KYBER_SYMBYTES + 0] = (uint8_t)nonce3; - extseed[3][KYBER_SYMBYTES + 1] = (uint8_t)(nonce3 >> 8); - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - state->s[i] = _mm256_xor_si256(state->s[i], state->s[i]); + --nblocks; } +} - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(state->s, SHAKE128_RATE, extseed[0], extseed[1], extseed[2], extseed[3], KYBER_SYMBYTES + 2, 0x1F); +void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -150,82 +113,78 @@ void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out2, uint8_t *out3, size_t nblocks, - keccak4x_state *state) { - keccak_squeezeblocks4x(out0, out1, out2, out3, nblocks, state->s, SHAKE128_RATE); + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE128_RATE, + state->s); } -static void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, size_t inlen) { - __m256i s[25]; - uint8_t t0[SHAKE256_RATE]; - uint8_t t1[SHAKE256_RATE]; - uint8_t t2[SHAKE256_RATE]; - uint8_t t3[SHAKE256_RATE]; - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - 
s[i] = _mm256_xor_si256(s[i], s[i]); - } - - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); - - /* Squeeze output */ - keccak_squeezeblocks4x(out0, out1, out2, out3, outlen / SHAKE256_RATE, s, SHAKE256_RATE); +void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} - out0 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out1 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out2 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out3 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; +void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, + state->s); +} - if (outlen % SHAKE256_RATE) { - keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE256_RATE); - for (size_t i = 0; i < outlen % SHAKE256_RATE; i++) { - out0[i] = t0[i]; - out1[i] = t1[i]; - out2[i] = t2[i]; - out3[i] = t3[i]; +void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + keccakx4_state state; + + PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(t[0], t[1], 
t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } -void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 1]; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - extseed[0][i] = key[i]; - extseed[1][i] = key[i]; - extseed[2][i] = key[i]; - extseed[3][i] = key[i]; +void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + keccakx4_state state; + + PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } } - extseed[0][KYBER_SYMBYTES] = nonce0; - extseed[1][KYBER_SYMBYTES] = nonce1; - extseed[2][KYBER_SYMBYTES] = nonce2; - extseed[3][KYBER_SYMBYTES] = nonce3; - - shake256x4(out0, - out1, - out2, - out3, - outlen, - extseed[0], - extseed[1], - extseed[2], - extseed[3], - KYBER_SYMBYTES + 1); } diff --git a/crypto_kem/kyber1024/avx2/fips202x4.h b/crypto_kem/kyber1024/avx2/fips202x4.h index d79081b9..a2d7802c 100644 --- a/crypto_kem/kyber1024/avx2/fips202x4.h +++ b/crypto_kem/kyber1024/avx2/fips202x4.h @@ -7,31 +7,19 @@ 
typedef struct { __m256i s[25]; -} keccak4x_state; - -void PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); - -void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccak4x_state *state); - -void PQCLEAN_KYBER1024_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3); +} keccakx4_state; + +void PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state); + +void PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, + keccakx4_state *state); + +void PQCLEAN_KYBER1024_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER1024_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); #endif diff --git a/crypto_kem/kyber512/avx2/fq.s b/crypto_kem/kyber1024/avx2/fq.S similarity index 54% rename from crypto_kem/kyber512/avx2/fq.s rename to crypto_kem/kyber1024/avx2/fq.S index 348898a5..1fba2bcd 100644 --- a/crypto_kem/kyber512/avx2/fq.s +++ 
b/crypto_kem/kyber1024/avx2/fq.S @@ -1,11 +1,8 @@ +#include "cdecl.inc" .include "fq.inc" -.global PQCLEAN_KYBER512_AVX2_reduce_avx -PQCLEAN_KYBER512_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 - +.text +reduce128_avx: #load vmovdqa (%rdi),%ymm2 vmovdqa 32(%rdi),%ymm3 @@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 #store vmovdqa %ymm2,(%rdi) @@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_csubq_avx -PQCLEAN_KYBER512_AVX2_csubq_avx: +.global cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_reduce_avx): #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret +csubq128_avx: #load vmovdqa (%rdi),%ymm1 vmovdqa 32(%rdi),%ymm2 @@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6 vmovdqa 192(%rdi),%ymm7 vmovdqa 224(%rdi),%ymm8 -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 #store vmovdqa %ymm1,(%rdi) @@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_frommont_avx -PQCLEAN_KYBER512_AVX2_frommont_avx: +.global cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_csubq_avx): #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER512_AVX2_16xmontsqhi(%rip),%ymm2 +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret +tomont128_avx: #load vmovdqa (%rdi),%ymm3 vmovdqa 32(%rdi),%ymm4 @@ -90,14 +96,14 @@ vmovdqa 
160(%rdi),%ymm8 vmovdqa 192(%rdi),%ymm9 vmovdqa 224(%rdi),%ymm10 -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 -fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 #store vmovdqa %ymm3,(%rdi) @@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi) vmovdqa %ymm10,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber1024/avx2/fq.inc b/crypto_kem/kyber1024/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber1024/avx2/fq.inc +++ b/crypto_kem/kyber1024/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro fqmulprecomp al,ah,b x=12 +.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber1024/avx2/indcpa.c b/crypto_kem/kyber1024/avx2/indcpa.c index b68568d1..0ec35809 100644 --- a/crypto_kem/kyber1024/avx2/indcpa.c +++ b/crypto_kem/kyber1024/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include "ntt.h" +#include 
"params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include /************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t 
packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input 
polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER1024_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER1024_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER1024_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER1024_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + 
unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,57 +150,76 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER1024_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER1024_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - uint16_t i; - size_t ctr0, ctr1, ctr2, ctr3, bufbytes; - union { - uint8_t x[4][XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; - keccak4x_state state; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int i = 0, ctr0 = 0, ctr1 = 0, ctr2 = 0, ctr3 = 0; + ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + __m256i f; + 
keccakx4_state state; for (i = 0; i < 4; i++) { + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); + if (transposed) { - PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb( - &state, seed, i + 0, i + 256, i + 512, i + 768); + buf.arr[0][KYBER_SYMBYTES + 0] = i; + buf.arr[0][KYBER_SYMBYTES + 1] = 0; + buf.arr[1][KYBER_SYMBYTES + 0] = i; + buf.arr[1][KYBER_SYMBYTES + 1] = 1; + buf.arr[2][KYBER_SYMBYTES + 0] = i; + buf.arr[2][KYBER_SYMBYTES + 1] = 2; + buf.arr[3][KYBER_SYMBYTES + 0] = i; + buf.arr[3][KYBER_SYMBYTES + 1] = 3; } else { - PQCLEAN_KYBER1024_AVX2_kyber_shake128x4_absorb( - &state, seed, 256 * i + 0, 256 * i + 1, 256 * i + 2, 256 * i + 3); + buf.arr[0][KYBER_SYMBYTES + 0] = 0; + buf.arr[0][KYBER_SYMBYTES + 1] = i; + buf.arr[1][KYBER_SYMBYTES + 0] = 1; + buf.arr[1][KYBER_SYMBYTES + 1] = i; + buf.arr[2][KYBER_SYMBYTES + 0] = 2; + buf.arr[2][KYBER_SYMBYTES + 1] = i; + buf.arr[3][KYBER_SYMBYTES + 0] = 3; + buf.arr[3][KYBER_SYMBYTES + 1] = i; } - PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks( - buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state); - bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; + PQCLEAN_KYBER1024_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], + GEN_MATRIX_NBLOCKS, &state); - ctr0 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[0].coeffs, KYBER_N, buf.x[0], bufbytes); - ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[1].coeffs, KYBER_N, buf.x[1], bufbytes); - ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[2].coeffs, KYBER_N, buf.x[2], bufbytes); - ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform(a[i].vec[3].coeffs, KYBER_N, buf.x[3], bufbytes); + ctr0 = 
PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[0].coeffs, buf.arr[0]); + ctr1 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[1].coeffs, buf.arr[1]); + ctr2 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[2].coeffs, buf.arr[2]); + ctr3 = PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(a[i].vec[3].coeffs, buf.arr[3]); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state); - bufbytes = XOF_BLOCKBYTES; - - ctr0 += rej_uniform_ref(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); - ctr1 += rej_uniform_ref(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes); - ctr2 += rej_uniform_ref(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes); - ctr3 += rej_uniform_ref(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes); + PQCLEAN_KYBER1024_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + ctr0 += rej_uniform(a[i].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], + XOF_BLOCKBYTES); + ctr1 += rej_uniform(a[i].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], + XOF_BLOCKBYTES); + ctr2 += rej_uniform(a[i].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], + XOF_BLOCKBYTES); + ctr3 += rej_uniform(a[i].vec[3].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], + XOF_BLOCKBYTES); } PQCLEAN_KYBER1024_AVX2_poly_nttunpack(&a[i].vec[0]); @@ -177,36 +230,41 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER1024_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t 
*pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, nonce + 0, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, nonce + 4, nonce + 5, nonce + 6, nonce + 7); + PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, skpv.vec + 3, noiseseed, + 0, 1, 2, 3); + PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(e.vec + 0, e.vec + 1, e.vec + 2, e.vec + 3, noiseseed, + 4, 5, 6, 7); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - PQCLEAN_KYBER1024_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER1024_AVX2_poly_tomont(&pkpv.vec[i]); } 
PQCLEAN_KYBER1024_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -217,45 +275,52 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER1024_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER1024_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, 
seed.arr); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, nonce + 0, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, nonce + 4, nonce + 5, nonce + 6, nonce + 7); - PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, nonce + 8); + PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, sp.vec + 3, coins, + 0, 1, 2, 3); + PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(ep.vec + 0, ep.vec + 1, ep.vec + 2, ep.vec + 3, coins, + 4, 5, 6, 7); + PQCLEAN_KYBER1024_AVX2_poly_getnoise(&epp, coins, 8); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER1024_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER1024_AVX2_poly_invntt(&v); + PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER1024_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER1024_AVX2_poly_add(&v, &v, &epp); @@ -267,18 +332,21 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER1024_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -286,8 +354,8 @@ void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER1024_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER1024_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER1024_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER1024_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber1024/avx2/indcpa.h b/crypto_kem/kyber1024/avx2/indcpa.h index 39462caa..1f9b9604 100644 --- a/crypto_kem/kyber1024/avx2/indcpa.h +++ b/crypto_kem/kyber1024/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER1024_AVX2_INDCPA_H +#define PQCLEAN_KYBER1024_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER1024_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER1024_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER1024_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - 
const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER1024_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER1024_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER1024_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER1024_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber512/avx2/invntt.s b/crypto_kem/kyber1024/avx2/invntt.S similarity index 76% rename from crypto_kem/kyber512/avx2/invntt.s rename to crypto_kem/kyber1024/avx2/invntt.S index 1673a338..0715e88c 100644 --- a/crypto_kem/kyber512/avx2/invntt.s +++ b/crypto_kem/kyber1024/avx2/invntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 #update & mul vpsubw %ymm\rh0,%ymm\rl0,%ymm12 vpsubw %ymm\rh1,%ymm\rl1,%ymm13 @@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.global PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 - +.text +invntt_levels0t5_avx: level0: #zetas vmovdqu (%rsi),%ymm15 @@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 level1: #zetas vmovdqu 128(%rsi),%ymm3 vmovdqu 160(%rsi),%ymm2 -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 shuffle1 4,5,3,5 shuffle1 6,7,4,7 @@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 vmovdqu 224(%rsi),%ymm2 #consts -vmovdqa 
PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 +vmovdqa _16XV*2(%rdx),%ymm1 -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 red16 3 @@ -95,7 +92,7 @@ level3: vmovdqu 256(%rsi),%ymm9 vmovdqu 288(%rsi),%ymm2 -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 red16 10 @@ -109,7 +106,7 @@ level4: vmovdqu 320(%rsi),%ymm7 vmovdqu 352(%rsi),%ymm2 -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 red16 9 @@ -123,7 +120,7 @@ level5: vpbroadcastd 384(%rsi),%ymm8 vpbroadcastd 388(%rsi),%ymm2 -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 red16 7 @@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_invntt_level6_avx -PQCLEAN_KYBER512_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 - +invntt_level6_avx: #zetas vpbroadcastd (%rsi),%ymm1 vpbroadcastd 4(%rsi),%ymm2 @@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,256(%rdi) @@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) vmovdqa %ymm10,320(%rdi) vmovdqa %ymm11,352(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,(%rdi) @@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER512_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,384(%rdi) @@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) vmovdqa %ymm10,448(%rdi) vmovdqa %ymm11,480(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 
12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,128(%rdi) @@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) vmovdqa %ymm7,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber1024/avx2/kem.c b/crypto_kem/kyber1024/avx2/kem.c index c93fea21..cca559b5 100644 --- a/crypto_kem/kyber1024/avx2/kem.c +++ b/crypto_kem/kyber1024/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER1024_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + 
KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, 
buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER1024_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER1024_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER1024_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER1024_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER1024_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER1024_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ 
+ /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER1024_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024/avx2/kem.h b/crypto_kem/kyber1024/avx2/kem.h new file mode 100644 index 00000000..35a3c39c --- /dev/null +++ b/crypto_kem/kyber1024/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER1024_AVX2_KEM_H +#define PQCLEAN_KYBER1024_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER1024_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER1024_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER1024_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber1024-90s/avx2/ntt.s b/crypto_kem/kyber1024/avx2/ntt.S similarity index 81% rename from crypto_kem/kyber1024-90s/avx2/ntt.s rename to crypto_kem/kyber1024/avx2/ntt.S index 17bfcdb2..9cc14c03 100644 --- a/crypto_kem/kyber1024-90s/avx2/ntt.s +++ b/crypto_kem/kyber1024/avx2/ntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 @@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3 # We break the dependency chains with the cost of slightly more additions. 
# But they can be run in parallel to the multiplications on execution port 5 # (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x @@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3 vpsubw %ymm15,%ymm\rl3,%ymm\rl3 .endm -.global PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx -PQCLEAN_KYBER102490S_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 - +.text +ntt_level0_avx: level0: #zetas vpbroadcastd (%rsi),%ymm15 @@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER102490S_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 - +ntt_levels1t6_avx: level1: #zetas vpbroadcastd (%rsi),%ymm15 @@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly2 4,5,6,7,8,9,10,11 3 +butterfly2 4,5,6,7,8,9,10,11,3 level2: #zetas @@ -139,7 +133,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly2 3,8,4,9,5,10,6,11 7 +butterfly2 3,8,4,9,5,10,6,11,7 level3: #zetas @@ -151,7 +145,7 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly2 7,5,3,10,8,6,4,11 9 +butterfly2 7,5,3,10,8,6,4,11,9 level4: #zetas @@ -163,7 +157,7 @@ shuffle2 5,6,7,6 shuffle2 3,4,5,4 shuffle2 10,11,3,11 -butterfly2 9,8,7,6,5,4,3,11 10 +butterfly2 9,8,7,6,5,4,3,11,10 level5: #zetas @@ -175,7 +169,7 @@ shuffle1 8,4,9,4 shuffle1 7,3,8,3 shuffle1 6,11,7,11 -butterfly2 10,5,9,4,8,3,7,11 6 +butterfly2 10,5,9,4,8,3,7,11,6 level6: #zetas @@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15 vmovdqu 296(%rsi),%ymm1 vmovdqu 360(%rsi),%ymm2 -butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 
-red16 9 14 -red16 4 15 -red16 8 2 -red16 3 6 -red16 7 12 -red16 11 13 +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 #store vmovdqa %ymm10,(%rdi) @@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi) vmovdqa %ymm11,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber1024/avx2/ntt.h b/crypto_kem/kyber1024/avx2/ntt.h index 763b8a0d..fb7505e6 100644 --- a/crypto_kem/kyber1024/avx2/ntt.h +++ b/crypto_kem/kyber1024/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER1024_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER1024_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + +void PQCLEAN_KYBER1024_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t 
*PQCLEAN_KYBER1024_AVX2_qdata); + +void PQCLEAN_KYBER1024_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + + +void PQCLEAN_KYBER1024_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + +void PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + + +void PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + +void PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024/avx2/params.h b/crypto_kem/kyber1024/avx2/params.h index 85dcf73a..bfaecb40 100644 --- a/crypto_kem/kyber1024/avx2/params.h +++ b/crypto_kem/kyber1024/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER1024_AVX2_PARAMS_H +#define PQCLEAN_KYBER1024_AVX2_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 #define KYBER_POLYCOMPRESSEDBYTES 160 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 
2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber1024/avx2/poly.c b/crypto_kem/kyber1024/avx2/poly.c index 025a8e98..028f303d 100644 --- a/crypto_kem/kyber1024/avx2/poly.c +++ b/crypto_kem/kyber1024/avx2/poly.c @@ -1,132 +1,242 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER1024_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER1024_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31); + t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } - r[k] = (uint8_t)( t[0] | (t[1] << 5)); - r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); - r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); - k += 5; + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | 
(t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER1024_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER1024_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 7] = (int16_t)( (((a[4] >> 3) * KYBER_Q) + 16) >> 5); +void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = (a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); a += 5; + + for (j = 0; j < 8; j++) { + 
r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER1024_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER1024_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER1024_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER1024_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER1024_AVX2_qdata); +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs, a); - 
PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_load_si256((__m256i *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER1024_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte 
message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + __m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - PQCLEAN_KYBER1024_AVX2_cbd(r, buf); +void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + 
ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER1024_AVX2_cbd(r, buf.arr); } -// FIXME void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t *seed, + const uint8_t seed[32], uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { - uint8_t buf[4][SHAKE256_RATE]; - - PQCLEAN_KYBER1024_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3); - - PQCLEAN_KYBER1024_AVX2_cbd(r0, buf[0]); - PQCLEAN_KYBER1024_AVX2_cbd(r1, buf[1]); - PQCLEAN_KYBER1024_AVX2_cbd(r2, buf[2]); - PQCLEAN_KYBER1024_AVX2_cbd(r3, buf[3]); + ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + __m256i f; + keccakx4_state state; + + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); + + buf.arr[0][32] = nonce0; + buf.arr[1][32] = nonce1; + buf.arr[2][32] = nonce2; + buf.arr[3][32] = nonce3; + + PQCLEAN_KYBER1024_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); + PQCLEAN_KYBER1024_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + PQCLEAN_KYBER1024_AVX2_cbd(r0, buf.arr[0]); + PQCLEAN_KYBER1024_AVX2_cbd(r1, buf.arr[1]); + PQCLEAN_KYBER1024_AVX2_cbd(r2, buf.arr[2]); + PQCLEAN_KYBER1024_AVX2_cbd(r3, buf.arr[3]); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER1024_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -135,73 +245,78 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r) { - 
PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp); - PQCLEAN_KYBER1024_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER1024_AVX2_zetas_exp); - PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_exp + 4); - PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_exp + 200); + PQCLEAN_KYBER1024_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp); - PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER1024_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER1024_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r) { + PQCLEAN_KYBER1024_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER1024_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 152); - PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 64, - 
a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 184); - PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 348); - PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 380); +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER1024_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs); - PQCLEAN_KYBER1024_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER1024_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r) { - 
PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER1024_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER1024_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER1024_AVX2_poly_add * * Description: Add two polynomials * @@ -210,18 +325,19 @@ void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER1024_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -230,127 +346,13 @@ void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly 
*b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = 
_mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = 
_mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber1024/avx2/poly.h b/crypto_kem/kyber1024/avx2/poly.h index f3a6652a..b2c7000d 100644 --- a/crypto_kem/kyber1024/avx2/poly.h +++ b/crypto_kem/kyber1024/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER1024_AVX2_POLY_H +#define PQCLEAN_KYBER1024_AVX2_POLY_H #include "params.h" - #include #include @@ -11,20 +10,28 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER1024_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER1024_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER1024_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER1024_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); + -void PQCLEAN_KYBER1024_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER1024_AVX2_poly_tomsg(uint8_t 
msg[KYBER_SYMBYTES], poly *a); +void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER1024_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, @@ -37,15 +44,23 @@ void PQCLEAN_KYBER1024_AVX2_poly_getnoise4x(poly *r0, void PQCLEAN_KYBER1024_AVX2_poly_ntt(poly *r); -void PQCLEAN_KYBER1024_AVX2_poly_invntt(poly *r); + +void PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER1024_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER1024_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER1024_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER1024_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER1024_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER1024_AVX2_poly_reduce(poly *r); + void PQCLEAN_KYBER1024_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER1024_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER1024_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber1024/avx2/polyvec.c b/crypto_kem/kyber1024/avx2/polyvec.c index 2c5c547a..8710f90b 100644 --- a/crypto_kem/kyber1024/avx2/polyvec.c +++ b/crypto_kem/kyber1024/avx2/polyvec.c @@ -1,167 +1,198 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec 
*a) { +void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER1024_AVX2_polyvec_csubq(a); uint16_t t[8]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - for (size_t k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) + / KYBER_Q) & 0x7ff; + } } - r[11 * j + 0] = (uint8_t)t[0]; - r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); - r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); - r[11 * j + 3] = (uint8_t)((t[2] >> 2)); - r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); - r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); - r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 0x01) << 7)); - r[11 * j + 7] = (uint8_t)((t[5] >> 1)); - r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); - r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); - r[11 * j + 10] = (uint8_t)((t[7] >> 3)); + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; } - r += 352; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER1024_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of 
polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 4] = (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); +void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + 
t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } } - a += 352; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER1024_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_frombytes(&r->vec[i], a + i * 
KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 152); - PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 184); - PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 348); - PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER1024_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER1024_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER1024_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector of polynomials +* for details 
of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER1024_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -170,7 +201,8 @@ void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber1024/avx2/polyvec.h b/crypto_kem/kyber1024/avx2/polyvec.h index 40bd2ef0..981b8a7e 100644 --- a/crypto_kem/kyber1024/avx2/polyvec.h +++ b/crypto_kem/kyber1024/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER1024_AVX2_POLYVEC_H +#define PQCLEAN_KYBER1024_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER1024_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER1024_AVX2_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void 
PQCLEAN_KYBER1024_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER1024_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER1024_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER1024_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER1024_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER1024_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER1024_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER1024_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER1024_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber1024/avx2/reduce.h b/crypto_kem/kyber1024/avx2/reduce.h index 279d9b8e..9daf9b7a 100644 --- a/crypto_kem/kyber1024/avx2/reduce.h +++ b/crypto_kem/kyber1024/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER1024_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER1024_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + +int16_t PQCLEAN_KYBER1024_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); + +int16_t PQCLEAN_KYBER1024_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER1024_AVX2_qdata); #endif diff --git a/crypto_kem/kyber1024/avx2/rejsample.c b/crypto_kem/kyber1024/avx2/rejsample.c index aa25ae6a..b091901a 100644 --- a/crypto_kem/kyber1024/avx2/rejsample.c +++ b/crypto_kem/kyber1024/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" #include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, 
- { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, - { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 
0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 
14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 
0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 
2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, 
-1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 
2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + 
{ 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + 
{ 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 672 +unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER1024_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XQ]); + const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER1024_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, 
g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = _mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = 
_mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = _mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned 
int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i 
*)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber1024/avx2/rejsample.h b/crypto_kem/kyber1024/avx2/rejsample.h index 48dccdd6..b4cf4d74 100644 --- a/crypto_kem/kyber1024/avx2/rejsample.h +++ b/crypto_kem/kyber1024/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER1024_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER1024_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber512-90s/avx2/shuffle.s b/crypto_kem/kyber1024/avx2/shuffle.S similarity index 66% rename from crypto_kem/kyber512-90s/avx2/shuffle.s rename to crypto_kem/kyber1024/avx2/shuffle.S index dfb56b55..4224fd85 100644 --- a/crypto_kem/kyber512-90s/avx2/shuffle.s +++ b/crypto_kem/kyber1024/avx2/shuffle.S @@ -1,12 +1,9 @@ +#include "cdecl.inc" .include "fq.inc" .include "shuffle.inc" -.global PQCLEAN_KYBER51290S_AVX2_nttunpack_avx -PQCLEAN_KYBER51290S_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1 - +/* +nttpack_avx: #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 
+shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret */ +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 @@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx -PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 +.global cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret +ntttobytes128_avx: #load vmovdqa (%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 @@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm12 #csubq -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 #bitpack vpsllw $12,%ymm6,%ymm4 @@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx -PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx: +.global cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER1024_AVX2_ntttobytes_avx): #consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xmask(%rip),%ymm0 +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret +nttfrombytes128_avx: #load vmovdqu (%rsi),%ymm4 vmovdqu 32(%rsi),%ymm5 @@ -204,3 +243,13 @@ vmovdqa %ymm15,192(%rdi) vmovdqa %ymm1,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx) 
+cdecl(PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/crypto_kem/kyber1024/avx2/shuffle.inc b/crypto_kem/kyber1024/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber1024/avx2/shuffle.inc +++ b/crypto_kem/kyber1024/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber1024/avx2/symmetric-fips202.c b/crypto_kem/kyber1024/avx2/symmetric-fips202.c deleted file mode 100644 index 483d7c59..00000000 --- a/crypto_kem/kyber1024/avx2/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. - -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. 
Can be called multiple times to keep squeezing, -* i.e., is incremental. -* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber1024/avx2/symmetric-shake.c b/crypto_kem/kyber1024/avx2/symmetric-shake.c new file mode 100644 index 00000000..0a7ae70a --- /dev/null +++ b/crypto_kem/kyber1024/avx2/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - keccak_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber1024/avx2/symmetric.h b/crypto_kem/kyber1024/avx2/symmetric.h index adba822a..31fb92c7 100644 --- a/crypto_kem/kyber1024/avx2/symmetric.h +++ b/crypto_kem/kyber1024/avx2/symmetric.h @@ -2,28 +2,36 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" #include "fips202x4.h" -typedef shake128ctx keccak_state; +typedef shake128ctx xof_state; -void 
PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER1024_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER1024_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +void PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(shake128ctx *s, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y); + +void PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); + +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_AVX2_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER1024_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES SHAKE128_RATE - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber1024/avx2/verify.c b/crypto_kem/kyber1024/avx2/verify.c index c66be4de..2f8abd6e 100644 --- a/crypto_kem/kyber1024/avx2/verify.c +++ b/crypto_kem/kyber1024/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER1024_AVX2_verify * * Description: Compare two 
arrays for equality in constant time. * -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = _mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER1024_AVX2_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber1024/avx2/verify.h b/crypto_kem/kyber1024/avx2/verify.h index c890d8a4..237328bf 100644 --- a/crypto_kem/kyber1024/avx2/verify.h +++ b/crypto_kem/kyber1024/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER1024_AVX2_VERIFY_H +#define PQCLEAN_KYBER1024_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER1024_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER1024_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber1024/clean/LICENSE b/crypto_kem/kyber1024/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber1024/clean/LICENSE +++ b/crypto_kem/kyber1024/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using 
public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber1024/clean/Makefile b/crypto_kem/kyber1024/clean/Makefile index d6ae930c..7a5c508a 100644 --- a/crypto_kem/kyber1024/clean/Makefile +++ b/crypto_kem/kyber1024/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber1024_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-fips202.o +HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h +OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-shake.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake index 9ef8667d..b23e1b61 100644 --- a/crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber1024/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber1024_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-fips202.obj +OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-shake.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber1024/clean/cbd.c b/crypto_kem/kyber1024/clean/cbd.c index 0f0ada3d..61e0115e 100644 --- a/crypto_kem/kyber1024/clean/cbd.c +++ b/crypto_kem/kyber1024/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" #include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x 
**************************************************/ -static uint32_t load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER1024_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber1024/clean/cbd.h b/crypto_kem/kyber1024/clean/cbd.h index da528362..9826089e 100644 --- a/crypto_kem/kyber1024/clean/cbd.h +++ b/crypto_kem/kyber1024/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER1024_CLEAN_CBD_H +#define PQCLEAN_KYBER1024_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t *buf); + +void 
PQCLEAN_KYBER1024_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber1024/clean/indcpa.c b/crypto_kem/kyber1024/clean/indcpa.c index a66e14d9..a15802f0 100644 --- a/crypto_kem/kyber1024/clean/indcpa.c +++ b/crypto_kem/kyber1024/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + 
size_t i = 0; PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v 
**************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER1024_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER1024_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER1024_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) 
**************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER1024_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER1024_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme 
underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER1024_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER1024_CLEAN_poly_tomont(&pkpv.vec[i]); } 
PQCLEAN_KYBER1024_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER1024_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { 
PQCLEAN_KYBER1024_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER1024_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER1024_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER1024_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER1024_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER1024_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER1024_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER1024_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER1024_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER1024_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber1024/clean/indcpa.h b/crypto_kem/kyber1024/clean/indcpa.h index be70897f..c67d62dc 100644 --- a/crypto_kem/kyber1024/clean/indcpa.h +++ b/crypto_kem/kyber1024/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER1024_CLEAN_INDCPA_H +#define PQCLEAN_KYBER1024_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER1024_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER1024_CLEAN_indcpa_enc( - uint8_t *c, - 
const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER1024_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER1024_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER1024_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber1024/clean/kem.c b/crypto_kem/kyber1024/clean/kem.c index ccb2a59f..20cd3ab0 100644 --- a/crypto_kem/kyber1024/clean/kem.c +++ b/crypto_kem/kyber1024/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + 
size_t i = 0; PQCLEAN_KYBER1024_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + 
hash_h(buf, buf, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER1024_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER1024_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER1024_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER1024_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER1024_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER1024_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER1024_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER1024_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + 
/* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber1024/clean/kem.h b/crypto_kem/kyber1024/clean/kem.h new file mode 100644 index 00000000..9160cf9d --- /dev/null +++ b/crypto_kem/kyber1024/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER1024_CLEAN_KEM_H +#define PQCLEAN_KYBER1024_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER1024_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber1024/clean/ntt.c b/crypto_kem/kyber1024/clean/ntt.c index 5937947c..e6d200dc 100644 --- a/crypto_kem/kyber1024/clean/ntt.c +++ b/crypto_kem/kyber1024/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER1024_CLEAN_zetas and PQCLEAN_KYBER1024_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = tmp[tree[i]]; + PQCLEAN_KYBER1024_CLEAN_zetas[i] = 
tmp[tree[i]]; k = 0; for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER1024_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128] = { - 
1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER1024_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output 
vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER1024_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER1024_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER1024_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER1024_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER1024_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ 
void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber1024/clean/ntt.h b/crypto_kem/kyber1024/clean/ntt.h index 78473592..69517a1a 100644 --- a/crypto_kem/kyber1024/clean/ntt.h +++ b/crypto_kem/kyber1024/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER1024_CLEAN_NTT_H +#define PQCLEAN_KYBER1024_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER1024_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER1024_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER1024_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER1024_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber1024/clean/params.h b/crypto_kem/kyber1024/clean/params.h index 85dcf73a..ad5b5a2e 100644 --- a/crypto_kem/kyber1024/clean/params.h +++ b/crypto_kem/kyber1024/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER1024_CLEAN_PARAMS_H +#define PQCLEAN_KYBER1024_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 4 #define KYBER_POLYCOMPRESSEDBYTES 160 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) 
-#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber1024/clean/poly.c b/crypto_kem/kyber1024/clean/poly.c index 6345e536..8da3f722 100644 --- a/crypto_kem/kyber1024/clean/poly.c +++ b/crypto_kem/kyber1024/clean/poly.c @@ -1,120 +1,177 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER1024_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; 
j++) { + t[j] = ((((uint32_t)a->coeffs[8 * i + j] << 5) + KYBER_Q / 2) / KYBER_Q) & 31; } - r[k] = (uint8_t)( t[0] | (t[1] << 5)); - r[k + 1] = (uint8_t)((t[1] >> 3) | (t[2] << 2) | (t[3] << 7)); - r[k + 2] = (uint8_t)((t[3] >> 1) | (t[4] << 4)); - r[k + 3] = (uint8_t)((t[4] >> 4) | (t[5] << 1) | (t[6] << 6)); - r[k + 4] = (uint8_t)((t[6] >> 2) | (t[7] << 3)); - k += 5; + r[0] = (t[0] >> 0) | (t[1] << 5); + r[1] = (t[1] >> 3) | (t[2] << 2) | (t[3] << 7); + r[2] = (t[3] >> 1) | (t[4] << 4); + r[3] = (t[4] >> 4) | (t[5] << 1) | (t[6] << 6); + r[4] = (t[6] >> 2) | (t[7] << 3); + r += 5; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER1024_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER1024_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 5) | ((a[1] & 3) << 3)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 2] = (int16_t)(((((a[1] >> 2) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 7) | ((a[2] & 15) << 1)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 4] = (int16_t)(((((a[2] >> 4) | ((a[3] & 1) << 4)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 5] = (int16_t)(((((a[3] >> 1) & 31) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 6] = (int16_t)(((((a[3] >> 6) | ((a[4] & 7) << 2)) * KYBER_Q) + 16) >> 5); - r->coeffs[i + 7] = (int16_t)( 
(((a[4] >> 3) * KYBER_Q) + 16) >> 5); +void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 5) | (a[1] << 3); + t[2] = (a[1] >> 2); + t[3] = (a[1] >> 7) | (a[2] << 1); + t[4] = (a[2] >> 4) | (a[3] << 4); + t[5] = (a[3] >> 1); + t[6] = (a[3] >> 6) | (a[4] << 2); + t[7] = (a[4] >> 3); a += 5; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint32_t)(t[j] & 31) * KYBER_Q + 16) >> 5; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER1024_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER1024_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER1024_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output 
polynomial -* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER1024_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + 
PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_CLEAN_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER1024_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER1024_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -128,20 +185,20 @@ void PQCLEAN_KYBER1024_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output 
polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER1024_CLEAN_invntt(r->coeffs); } /************************************************* -* Name: poly_basemul +* Name: PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -149,68 +206,64 @@ void PQCLEAN_KYBER1024_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER1024_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER1024_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER1024_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER1024_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER1024_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ -void 
PQCLEAN_KYBER1024_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER1024_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_montgomery_reduce( - (int32_t)r->coeffs[i] * f); + for (i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER1024_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER1024_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER1024_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER1024_CLEAN_poly_add * * Description: Add two polynomials * @@ -219,13 +272,14 @@ void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER1024_CLEAN_poly_sub * * Description: Subtract two polynomials * @@ -234,48 +288,8 @@ void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - uint16_t mask; - - 
for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER1024_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber1024/clean/poly.h b/crypto_kem/kyber1024/clean/poly.h index 12e0e11c..6a896e22 100644 --- a/crypto_kem/kyber1024/clean/poly.h +++ b/crypto_kem/kyber1024/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER1024_CLEAN_POLY_H +#define PQCLEAN_KYBER1024_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER1024_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER1024_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER1024_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER1024_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER1024_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER1024_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER1024_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER1024_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER1024_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER1024_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER1024_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER1024_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER1024_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER1024_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER1024_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER1024_CLEAN_poly_sub(poly *r, const poly *a, const 
poly *b); #endif diff --git a/crypto_kem/kyber1024/clean/polyvec.c b/crypto_kem/kyber1024/clean/polyvec.c index 76002c2d..dab073cd 100644 --- a/crypto_kem/kyber1024/clean/polyvec.c +++ b/crypto_kem/kyber1024/clean/polyvec.c @@ -1,138 +1,163 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(a); uint16_t t[8]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - for (size_t k = 0; k < 8; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) / KYBER_Q) & 0x7ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + for (k = 0; k < 8; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[8 * j + k] << 11) + KYBER_Q / 2) + / KYBER_Q) & 0x7ff; + } } - r[11 * j + 0] = (uint8_t)t[0]; - r[11 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x1f) << 3)); - r[11 * j + 2] = (uint8_t)((t[1] >> 5) | ((t[2] & 0x03) << 6)); - r[11 * j + 3] = (uint8_t)((t[2] >> 2)); - r[11 * j + 4] = (uint8_t)((t[2] >> 10) | ((t[3] & 0x7f) << 1)); - r[11 * j + 5] = (uint8_t)((t[3] >> 7) | ((t[4] & 0x0f) << 4)); - r[11 * j + 6] = (uint8_t)((t[4] >> 4) | ((t[5] & 
0x01) << 7)); - r[11 * j + 7] = (uint8_t)((t[5] >> 1)); - r[11 * j + 8] = (uint8_t)((t[5] >> 9) | ((t[6] & 0x3f) << 2)); - r[11 * j + 9] = (uint8_t)((t[6] >> 6) | ((t[7] & 0x07) << 5)); - r[11 * j + 10] = (uint8_t)((t[7] >> 3)); + r[ 0] = (t[0] >> 0); + r[ 1] = (t[0] >> 8) | (t[1] << 3); + r[ 2] = (t[1] >> 5) | (t[2] << 6); + r[ 3] = (t[2] >> 2); + r[ 4] = (t[2] >> 10) | (t[3] << 1); + r[ 5] = (t[3] >> 7) | (t[4] << 4); + r[ 6] = (t[4] >> 4) | (t[5] << 7); + r[ 7] = (t[5] >> 1); + r[ 8] = (t[5] >> 9) | (t[6] << 2); + r[ 9] = (t[6] >> 6) | (t[7] << 5); + r[10] = (t[7] >> 3); + r += 11; } - r += 352; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER1024_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 8; j++) { - r->vec[i].coeffs[8 * j + 0] = (int16_t)( (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 1] = (int16_t)(((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 2] = (int16_t)(((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) | (((uint32_t)a[11 * j + 4] & 0x01) << 10)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 3] = (int16_t)(((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 4] 
= (int16_t)(((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 5] = (int16_t)(((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) | (((uint32_t)a[11 * j + 8] & 0x03) << 9)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 6] = (int16_t)(((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) * KYBER_Q) + 1024) >> 11); - r->vec[i].coeffs[8 * j + 7] = (int16_t)(((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) * KYBER_Q) + 1024) >> 11); +void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[8]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 8; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[ 1] << 8); + t[1] = (a[1] >> 3) | ((uint16_t)a[ 2] << 5); + t[2] = (a[2] >> 6) | ((uint16_t)a[ 3] << 2) | ((uint16_t)a[4] << 10); + t[3] = (a[4] >> 1) | ((uint16_t)a[ 5] << 7); + t[4] = (a[5] >> 4) | ((uint16_t)a[ 6] << 4); + t[5] = (a[6] >> 7) | ((uint16_t)a[ 7] << 1) | ((uint16_t)a[8] << 9); + t[6] = (a[8] >> 2) | ((uint16_t)a[ 9] << 6); + t[7] = (a[9] >> 5) | ((uint16_t)a[10] << 3); + a += 11; + + for (k = 0; k < 8; k++) { + r->vec[i].coeffs[8 * j + k] = ((uint32_t)(t[k] & 0x7FF) * KYBER_Q + 1024) >> 11; + } } - a += 352; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i 
< KYBER_K; i++) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * 
Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. * * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER1024_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER1024_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER1024_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER1024_CLEAN_poly_add(r, r, &t); } @@ -140,37 +165,40 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, co } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a 
vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in reduce.c +* for details of conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER1024_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -179,7 +207,8 @@ void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER1024_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER1024_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber1024/clean/polyvec.h b/crypto_kem/kyber1024/clean/polyvec.h index c4712e12..b1281275 100644 --- 
a/crypto_kem/kyber1024/clean/polyvec.h +++ b/crypto_kem/kyber1024/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER1024_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER1024_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER1024_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER1024_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER1024_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER1024_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER1024_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER1024_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER1024_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER1024_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER1024_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER1024_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber1024/clean/reduce.c b/crypto_kem/kyber1024/clean/reduce.c index 340fd618..03403dd2 100644 --- a/crypto_kem/kyber1024/clean/reduce.c +++ b/crypto_kem/kyber1024/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: 
montgomery_reduce +* Name: PQCLEAN_KYBER1024_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. **************************************************/ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER1024_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER1024_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t x: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber1024/clean/reduce.h b/crypto_kem/kyber1024/clean/reduce.h index 248fd8e1..3148e692 100644 --- a/crypto_kem/kyber1024/clean/reduce.h +++ b/crypto_kem/kyber1024/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER1024_CLEAN_REDUCE_H +#define PQCLEAN_KYBER1024_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER1024_CLEAN_montgomery_reduce(int32_t a); + int16_t PQCLEAN_KYBER1024_CLEAN_barrett_reduce(int16_t a); + int16_t PQCLEAN_KYBER1024_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber1024/clean/symmetric-fips202.c b/crypto_kem/kyber1024/clean/symmetric-fips202.c deleted file mode 100644 index 949983a9..00000000 --- a/crypto_kem/kyber1024/clean/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
- -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. Can be called multiple times to keep squeezing, -* i.e., is incremental. -* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER1024_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; 
- - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber1024/clean/symmetric-shake.c b/crypto_kem/kyber1024/clean/symmetric-shake.c new file mode 100644 index 00000000..ee5622e9 --- /dev/null +++ b/crypto_kem/kyber1024/clean/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. +* +* Arguments: - keccak_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t 
nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber1024/clean/symmetric.h b/crypto_kem/kyber1024/clean/symmetric.h index 2320e411..263574ad 100644 --- a/crypto_kem/kyber1024/clean/symmetric.h +++ b/crypto_kem/kyber1024/clean/symmetric.h @@ -2,29 +2,35 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" -#include +typedef shake128ctx xof_state; + +void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(xof_state *s, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y); -typedef shake128ctx keccak_state; +void PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); -void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER1024_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER1024_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER1024_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER1024_CLEAN_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ 
+ PQCLEAN_KYBER1024_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES 168 - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber1024/clean/verify.c b/crypto_kem/kyber1024/clean/verify.c index 78657d64..bd1dc887 100644 --- a/crypto_kem/kyber1024/clean/verify.c +++ b/crypto_kem/kyber1024/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER1024_CLEAN_verify * * Description: Compare two arrays for equality in constant time. * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER1024_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER1024_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER1024_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. 
Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER1024_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_ * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER1024_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber1024/clean/verify.h b/crypto_kem/kyber1024/clean/verify.h index 53a7f086..d24f4fe0 100644 --- a/crypto_kem/kyber1024/clean/verify.h +++ b/crypto_kem/kyber1024/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER1024_CLEAN_VERIFY_H +#define PQCLEAN_KYBER1024_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER1024_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER1024_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER1024_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber512-90s/META.yml b/crypto_kem/kyber512-90s/META.yml index 2d2467ec..5902a9c4 100644 --- a/crypto_kem/kyber512-90s/META.yml +++ b/crypto_kem/kyber512-90s/META.yml @@ -28,6 +28,7 @@ implementations: - architecture: x86_64 operating_systems: - Linux + - Darwin required_flags: - aes - avx2 diff --git a/crypto_kem/kyber512-90s/avx2/LICENSE b/crypto_kem/kyber512-90s/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber512-90s/avx2/LICENSE +++ b/crypto_kem/kyber512-90s/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - 
Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber512-90s/avx2/Makefile b/crypto_kem/kyber512-90s/avx2/Makefile index d58118c3..d8906ce5 100644 --- a/crypto_kem/kyber512-90s/avx2/Makefile +++ b/crypto_kem/kyber512-90s/avx2/Makefile @@ -1,9 +1,40 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512-90s_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h aes256ctr.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o aes256ctr.o +HEADERS= \ + aes256ctr.h \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + aes256ctr.o \ + basemul.o \ + cbd.o \ + consts.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + verify.o CFLAGS=-mavx2 -maes -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ -Wmissing-prototypes -Wredundant-decls -std=c99 \ @@ -14,11 +45,8 @@ all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(AR) -r $@ $(OBJECTS) diff --git a/crypto_kem/kyber512-90s/avx2/aes256ctr.c b/crypto_kem/kyber512-90s/avx2/aes256ctr.c index 1a8128e4..e1428f2b 100644 --- a/crypto_kem/kyber512-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber512-90s/avx2/aes256ctr.c @@ -1,94 +1,68 @@ /* - crypto_stream_aes256ctr - based heavily on public-domain code by Romain Dolbeau + Based heavily on public-domain code by Romain Dolbeau Different handling of nonce+counter than original version - using 
separated 96-bit nonce and internal 32-bit counter, starting from zero + using separated 64-bit nonce and internal 64-bit counter, starting from zero Public Domain */ #include "aes256ctr.h" - #include +#include #include -static inline void aesni_encrypt8(uint8_t *out, +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { - __m128i nv0; - __m128i nv1; - __m128i nv2; - __m128i nv3; - __m128i nv4; - __m128i nv5; - __m128i nv6; - __m128i nv7; + __m128i f, f0, f1, f2, f3, t; /* Load current counter value */ - __m128i nv0i = _mm_load_si128(n); - - /* Increase counter in 8 consecutive blocks */ - nv0 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(0, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv1 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(1, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv2 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(2, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv3 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(3, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv4 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(4, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv5 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(5, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv6 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(6, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv7 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(7, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - - /* Write counter for next iteration, increased by 8 */ - _mm_store_si128(n, _mm_add_epi32(nv0i, _mm_set_epi64x(8, 0))); - - /* Actual AES encryption, 8x interleaved */ - __m128i temp0 = _mm_xor_si128(nv0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(nv1, rkeys[0]); - 
__m128i temp2 = _mm_xor_si128(nv2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(nv3, rkeys[0]); - __m128i temp4 = _mm_xor_si128(nv4, rkeys[0]); - __m128i temp5 = _mm_xor_si128(nv5, rkeys[0]); - __m128i temp6 = _mm_xor_si128(nv6, rkeys[0]); - __m128i temp7 = _mm_xor_si128(nv7, rkeys[0]); - - for (uint8_t i = 1; i < 14; i++) { - temp0 = _mm_aesenc_si128(temp0, rkeys[i]); - temp1 = _mm_aesenc_si128(temp1, rkeys[i]); - temp2 = _mm_aesenc_si128(temp2, rkeys[i]); - temp3 = _mm_aesenc_si128(temp3, rkeys[i]); - temp4 = _mm_aesenc_si128(temp4, rkeys[i]); - temp5 = _mm_aesenc_si128(temp5, rkeys[i]); - temp6 = _mm_aesenc_si128(temp6, rkeys[i]); - temp7 = _mm_aesenc_si128(temp7, rkeys[i]); + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + t = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, t); + f1 = _mm_xor_si128(f1, t); + f2 = _mm_xor_si128(f2, t); + f3 = _mm_xor_si128(f3, t); + + for (int i = 1; i < 14; i++) { + t = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, t); + f1 = _mm_aesenc_si128(f1, t); + f2 = _mm_aesenc_si128(f2, t); + f3 = _mm_aesenc_si128(f3, t); } - temp0 = _mm_aesenclast_si128(temp0, rkeys[14]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[14]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[14]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[14]); - temp4 = _mm_aesenclast_si128(temp4, rkeys[14]); - temp5 = _mm_aesenclast_si128(temp5, rkeys[14]); - temp6 = _mm_aesenclast_si128(temp6, rkeys[14]); - temp7 = 
_mm_aesenclast_si128(temp7, rkeys[14]); + t = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, t); + f1 = _mm_aesenclast_si128(f1, t); + f2 = _mm_aesenclast_si128(f2, t); + f3 = _mm_aesenclast_si128(f3, t); /* Write results */ - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); - _mm_storeu_si128((__m128i *)(out + 64), temp4); - _mm_storeu_si128((__m128i *)(out + 80), temp5); - _mm_storeu_si128((__m128i *)(out + 96), temp6); - _mm_storeu_si128((__m128i *)(out + 112), temp7); + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); } -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce) { - __m128i key0 = _mm_loadu_si128((__m128i *)(key + 0)); - __m128i key1 = _mm_loadu_si128((__m128i *)(key + 16)); - __m128i temp0, temp1, temp2, temp4; - size_t idx = 0; +void PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n = _mm_loadl_epi64((__m128i *)&nonce); state->rkeys[idx++] = key0; temp0 = key0; @@ -137,38 +111,33 @@ void PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(aes256ctr_ctx *state, state->rkeys[idx++] = temp0; } -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce) { - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); -} - void PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state) { - size_t i; - + size_t i = 0; for (i = 0; i < nblocks; i++) { - aesni_encrypt8(out, &state->n, 
state->rkeys); - out += 128; + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; } } void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, - const uint8_t *seed, - uint8_t nonce) { - size_t i; - uint8_t buf[128]; + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i = 0; + uint8_t buf[64]; aes256ctr_ctx state; - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, seed, (uint16_t)nonce << 8); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, seed, nonce); - while (outlen >= 128) { - aesni_encrypt8(out, &state.n, state.rkeys); - outlen -= 128; + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; } if (outlen) { - aesni_encrypt8(buf, &state.n, state.rkeys); + aesni_encrypt4(buf, &state.n, state.rkeys); for (i = 0; i < outlen; i++) { out[i] = buf[i]; } diff --git a/crypto_kem/kyber512-90s/avx2/aes256ctr.h b/crypto_kem/kyber512-90s/avx2/aes256ctr.h index a106fa21..c6a58d87 100644 --- a/crypto_kem/kyber512-90s/avx2/aes256ctr.h +++ b/crypto_kem/kyber512-90s/avx2/aes256ctr.h @@ -5,22 +5,17 @@ #include #include +#define AES256CTR_NAMESPACE(s) pqcrystals_aes256ctr_avx2##s + +#define AES256CTR_BLOCKBYTES 64 + typedef struct { __m128i rkeys[16]; __m128i n; } aes256ctr_ctx; -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce); -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce); -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, - size_t nblocks, - aes256ctr_ctx *state); - -void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out, - size_t outlen, - const uint8_t *seed, - uint8_t nonce); +void PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce); +void PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state); +void PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t seed[32], uint64_t 
nonce); #endif diff --git a/crypto_kem/kyber512-90s/avx2/align.h b/crypto_kem/kyber512-90s/avx2/align.h new file mode 100644 index 00000000..1581af7a --- /dev/null +++ b/crypto_kem/kyber512-90s/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER51290S_AVX2_ALIGN_H +#define PQCLEAN_KYBER51290S_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber512-90s/avx2/basemul.S b/crypto_kem/kyber512-90s/avx2/basemul.S index 4ba883a2..0c17514f 100644 --- a/crypto_kem/kyber512-90s/avx2/basemul.S +++ b/crypto_kem/kyber512-90s/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx -PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 - +.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -101,7 +97,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -128,7 +124,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -136,17 +132,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_basemul_avx -PQCLEAN_KYBER51290S_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx): #consts -vmovdqa 
PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -155,10 +174,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git a/crypto_kem/kyber512-90s/avx2/cbd.c b/crypto_kem/kyber512-90s/avx2/cbd.c index 9f280484..b23f5263 100644 --- a/crypto_kem/kyber512-90s/avx2/cbd.c +++ b/crypto_kem/kyber512-90s/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER51290S_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with 
parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *buf: pointer to input byte array +* - const unsigned char *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER51290S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber512-90s/avx2/cbd.h b/crypto_kem/kyber512-90s/avx2/cbd.h index 5d49aa72..040a26a6 100644 --- a/crypto_kem/kyber512-90s/avx2/cbd.h +++ b/crypto_kem/kyber512-90s/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER51290S_AVX2_CBD_H +#define PQCLEAN_KYBER51290S_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER51290S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber512-90s/avx2/cdecl.inc b/crypto_kem/kyber512-90s/avx2/cdecl.inc new file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber512-90s/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on MacOS exports all symbols with a leading + * 
underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber512-90s/avx2/consts.c b/crypto_kem/kyber512-90s/avx2/consts.c index be777ae6..30d81bef 100644 --- a/crypto_kem/kyber512-90s/avx2/consts.c +++ b/crypto_kem/kyber512-90s/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER51290S_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 
28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER51290S_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 
2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % 
KYBER_Q) -#define FLO (FHI * QINV % 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER51290S_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, 
QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 
11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 
47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 
1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git a/crypto_kem/kyber512-90s/avx2/consts.h b/crypto_kem/kyber512-90s/avx2/consts.h index 1409751a..c00f2983 100644 --- a/crypto_kem/kyber512-90s/avx2/consts.h +++ b/crypto_kem/kyber512-90s/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER51290S_AVX2_CONSTS_H +#define PQCLEAN_KYBER51290S_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER51290S_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER51290S_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmontsqlo; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER51290S_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER51290S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber512-90s/avx2/fq.s b/crypto_kem/kyber512-90s/avx2/fq.S similarity index 54% rename from crypto_kem/kyber512-90s/avx2/fq.s rename to crypto_kem/kyber512-90s/avx2/fq.S index e2c26353..78670c10 100644 --- a/crypto_kem/kyber512-90s/avx2/fq.s +++ b/crypto_kem/kyber512-90s/avx2/fq.S @@ -1,11 +1,8 @@ +#include "cdecl.inc" .include "fq.inc" -.global PQCLEAN_KYBER51290S_AVX2_reduce_avx -PQCLEAN_KYBER51290S_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 -vmovdqa 
PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1 - +.text +reduce128_avx: #load vmovdqa (%rdi),%ymm2 vmovdqa 32(%rdi),%ymm3 @@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 #store vmovdqa %ymm2,(%rdi) @@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_csubq_avx -PQCLEAN_KYBER51290S_AVX2_csubq_avx: +.global cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_reduce_avx): #consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret +csubq128_avx: #load vmovdqa (%rdi),%ymm1 vmovdqa 32(%rdi),%ymm2 @@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6 vmovdqa 192(%rdi),%ymm7 vmovdqa 224(%rdi),%ymm8 -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 #store vmovdqa %ymm1,(%rdi) @@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_frommont_avx -PQCLEAN_KYBER51290S_AVX2_frommont_avx: +.global cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_csubq_avx): #consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xmontsqhi(%rip),%ymm2 +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret +tomont128_avx: #load vmovdqa (%rdi),%ymm3 vmovdqa 32(%rdi),%ymm4 @@ -90,14 +96,14 @@ vmovdqa 160(%rdi),%ymm8 vmovdqa 192(%rdi),%ymm9 vmovdqa 224(%rdi),%ymm10 -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 
-fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 #store vmovdqa %ymm3,(%rdi) @@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi) vmovdqa %ymm10,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber512-90s/avx2/fq.inc b/crypto_kem/kyber512-90s/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber512-90s/avx2/fq.inc +++ b/crypto_kem/kyber512-90s/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro fqmulprecomp al,ah,b x=12 +.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber512-90s/avx2/indcpa.c b/crypto_kem/kyber512-90s/avx2/indcpa.c index 1118acca..944969e5 100644 --- a/crypto_kem/kyber512-90s/avx2/indcpa.c +++ b/crypto_kem/kyber512-90s/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include "ntt.h" +#include "params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include 
/************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < 
KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { 
+static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER51290S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER51290S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER51290S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 
<= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,46 +150,47 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER51290S_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER51290S_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER51290S_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - union { - uint8_t x[XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; + ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; aes256ctr_ctx state; PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, seed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_K; j++) { + for (i = 0; i < KYBER_K; 
i++) { + for (j = 0; j < KYBER_K; j++) { if (transposed) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (i << 8) + j); + nonce.orig = (j << 8) | i; } else { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (j << 8) + i); + nonce.orig = (i << 8) | j; } - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.x, GEN_MATRIX_MAXNBLOCKS, &state); - ctr = PQCLEAN_KYBER51290S_AVX2_rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf.x, GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES); + state.n = _mm_loadl_epi64(&nonce.vec); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); + ctr = PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); while (ctr < KYBER_N) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.x, 1, &state); - ctr += rej_uniform_ref(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.x, XOF_BLOCKBYTES); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, + XOF_BLOCKBYTES); } PQCLEAN_KYBER51290S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -164,47 +199,53 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER51290S_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - 
uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t coins[128]; - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, noiseseed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER51290S_AVX2_cbd(skpv.vec + i, coins); + ALIGN32_ARRAY(uint8_t, 128) coins; + PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER51290S_AVX2_cbd(&skpv.vec[i], coins.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER51290S_AVX2_cbd(e.vec + i, coins); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER51290S_AVX2_cbd(&e.vec[i], coins.arr); } PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; 
i++) { - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - PQCLEAN_KYBER51290S_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER51290S_AVX2_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER51290S_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -215,58 +256,67 @@ void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER51290S_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const 
uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER51290S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, seed.arr); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t buf[128]; - PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, coins, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER51290S_AVX2_cbd(sp.vec + i, buf); + ALIGN32_ARRAY(uint8_t, 128) buf; + PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER51290S_AVX2_cbd(&sp.vec[i], buf.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER51290S_AVX2_cbd(ep.vec + i, buf); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER51290S_AVX2_cbd(&ep.vec[i], buf.arr); } - PQCLEAN_KYBER51290S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER51290S_AVX2_cbd(&epp, buf); + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER51290S_AVX2_cbd(&epp, buf.arr); PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - 
PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER51290S_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER51290S_AVX2_poly_invntt(&v); + PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER51290S_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER51290S_AVX2_poly_add(&v, &v, &epp); @@ -278,18 +328,21 @@ void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER51290S_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER51290S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -297,8 +350,8 @@ void PQCLEAN_KYBER51290S_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(&bp); - 
PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER51290S_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER51290S_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER51290S_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber512-90s/avx2/indcpa.h b/crypto_kem/kyber512-90s/avx2/indcpa.h index 0a1f8a07..34aa1ffc 100644 --- a/crypto_kem/kyber512-90s/avx2/indcpa.h +++ b/crypto_kem/kyber512-90s/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER51290S_AVX2_INDCPA_H +#define PQCLEAN_KYBER51290S_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER51290S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER51290S_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER51290S_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER51290S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER51290S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber1024/avx2/invntt.s b/crypto_kem/kyber512-90s/avx2/invntt.S similarity index 76% rename from crypto_kem/kyber1024/avx2/invntt.s rename to crypto_kem/kyber512-90s/avx2/invntt.S index b15ab10c..48974cf8 100644 --- a/crypto_kem/kyber1024/avx2/invntt.s +++ b/crypto_kem/kyber512-90s/avx2/invntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" 
.include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 #update & mul vpsubw %ymm\rh0,%ymm\rl0,%ymm12 vpsubw %ymm\rh1,%ymm\rl1,%ymm13 @@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.global PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER1024_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 - +.text +invntt_levels0t5_avx: level0: #zetas vmovdqu (%rsi),%ymm15 @@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 level1: #zetas vmovdqu 128(%rsi),%ymm3 vmovdqu 160(%rsi),%ymm2 -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 shuffle1 4,5,3,5 shuffle1 6,7,4,7 @@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 vmovdqu 224(%rsi),%ymm2 #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xv(%rip),%ymm1 +vmovdqa _16XV*2(%rdx),%ymm1 -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 red16 3 @@ -95,7 +92,7 @@ level3: vmovdqu 256(%rsi),%ymm9 vmovdqu 288(%rsi),%ymm2 -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 red16 10 @@ -109,7 +106,7 @@ level4: vmovdqu 320(%rsi),%ymm7 vmovdqu 352(%rsi),%ymm2 -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 red16 9 @@ -123,7 +120,7 @@ level5: vpbroadcastd 384(%rsi),%ymm8 vpbroadcastd 388(%rsi),%ymm2 -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 red16 7 @@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_invntt_level6_avx -PQCLEAN_KYBER1024_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 - +invntt_level6_avx: #zetas vpbroadcastd (%rsi),%ymm1 vpbroadcastd 4(%rsi),%ymm2 @@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 butterfly 
4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,256(%rdi) @@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) vmovdqa %ymm10,320(%rdi) vmovdqa %ymm11,352(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,(%rdi) @@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,384(%rdi) @@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) vmovdqa %ymm10,448(%rdi) vmovdqa %ymm11,480(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,128(%rdi) @@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) vmovdqa %ymm7,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber512-90s/avx2/kem.c b/crypto_kem/kyber512-90s/avx2/kem.c index 9fb3435d..169224ac 100644 --- a/crypto_kem/kyber512-90s/avx2/kem.c +++ b/crypto_kem/kyber512-90s/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include 
/************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER51290S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES 
bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER51290S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER51290S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) 
to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER51290S_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER51290S_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER51290S_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER51290S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER51290S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER51290S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on 
re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER51290S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512-90s/avx2/kem.h b/crypto_kem/kyber512-90s/avx2/kem.h new file mode 100644 index 00000000..ecbd1259 --- /dev/null +++ b/crypto_kem/kyber512-90s/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER51290S_AVX2_KEM_H +#define PQCLEAN_KYBER51290S_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER51290S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber512/avx2/ntt.s b/crypto_kem/kyber512-90s/avx2/ntt.S similarity index 81% rename from crypto_kem/kyber512/avx2/ntt.s rename to crypto_kem/kyber512-90s/avx2/ntt.S index 5d16bbc4..477143a1 100644 --- a/crypto_kem/kyber512/avx2/ntt.s +++ b/crypto_kem/kyber512-90s/avx2/ntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 @@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3 # We break the dependency chains with the cost of slightly more additions. 
# But they can be run in parallel to the multiplications on execution port 5 # (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x @@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3 vpsubw %ymm15,%ymm\rl3,%ymm\rl3 .endm -.global PQCLEAN_KYBER512_AVX2_ntt_level0_avx -PQCLEAN_KYBER512_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 - +.text +ntt_level0_avx: level0: #zetas vpbroadcastd (%rsi),%ymm15 @@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER512_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 - +ntt_levels1t6_avx: level1: #zetas vpbroadcastd (%rsi),%ymm15 @@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly2 4,5,6,7,8,9,10,11 3 +butterfly2 4,5,6,7,8,9,10,11,3 level2: #zetas @@ -139,7 +133,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly2 3,8,4,9,5,10,6,11 7 +butterfly2 3,8,4,9,5,10,6,11,7 level3: #zetas @@ -151,7 +145,7 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly2 7,5,3,10,8,6,4,11 9 +butterfly2 7,5,3,10,8,6,4,11,9 level4: #zetas @@ -163,7 +157,7 @@ shuffle2 5,6,7,6 shuffle2 3,4,5,4 shuffle2 10,11,3,11 -butterfly2 9,8,7,6,5,4,3,11 10 +butterfly2 9,8,7,6,5,4,3,11,10 level5: #zetas @@ -175,7 +169,7 @@ shuffle1 8,4,9,4 shuffle1 7,3,8,3 shuffle1 6,11,7,11 -butterfly2 10,5,9,4,8,3,7,11 6 +butterfly2 10,5,9,4,8,3,7,11,6 level6: #zetas @@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15 vmovdqu 296(%rsi),%ymm1 vmovdqu 360(%rsi),%ymm2 -butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 -vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 -red16 9 14 -red16 4 15 -red16 8 
2 -red16 3 6 -red16 7 12 -red16 11 13 +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 #store vmovdqa %ymm10,(%rdi) @@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi) vmovdqa %ymm11,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber512-90s/avx2/ntt.h b/crypto_kem/kyber512-90s/avx2/ntt.h index 5ec047c3..b3f560ab 100644 --- a/crypto_kem/kyber512-90s/avx2/ntt.h +++ b/crypto_kem/kyber512-90s/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER51290S_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER51290S_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER51290S_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER51290S_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER51290S_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER51290S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER51290S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + +void PQCLEAN_KYBER51290S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t 
*PQCLEAN_KYBER51290S_AVX2_qdata); + +void PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + + +void PQCLEAN_KYBER51290S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + +void PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + + +void PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + +void PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512-90s/avx2/params.h b/crypto_kem/kyber512-90s/avx2/params.h index d086d4c6..ef580d09 100644 --- a/crypto_kem/kyber512-90s/avx2/params.h +++ b/crypto_kem/kyber512-90s/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER51290S_AVX2_PARAMS_H +#define PQCLEAN_KYBER51290S_AVX2_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 #define KYBER_POLYCOMPRESSEDBYTES 96 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + 
KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber512-90s/avx2/poly.c b/crypto_kem/kyber512-90s/avx2/poly.c index 2e93f266..1aa0f7b1 100644 --- a/crypto_kem/kyber512-90s/avx2/poly.c +++ b/crypto_kem/kyber512-90s/avx2/poly.c @@ -1,111 +1,208 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER51290S_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER51290S_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint16_t)a->coeffs[i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7); + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; } - r[k] = (uint8_t)( t[0] | (t[1] << 3) | (t[2] << 6)); - r[k + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[k + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); - k += 3; + r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[1] = (t[2] >> 2) | (t[3] << 1) | 
(t[4] << 4) | (t[5] << 7); + r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + r += 3; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER51290S_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER51290S_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 2] = (int16_t)(((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 4] = (int16_t)(((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 5] = (int16_t)(((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 6] = (int16_t)(((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 7] = (int16_t)(((((a[2] >> 5)) * KYBER_Q) + 4) >> 3); +void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 3); + t[2] = (a[0] >> 6) | (a[1] << 2); + t[3] = (a[1] >> 1); + t[4] = (a[1] >> 4); + t[5] = (a[1] >> 7) | (a[2] << 1); + t[6] = (a[2] >> 2); + t[7] = (a[2] >> 5); a += 3; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; + } } } /************************************************* -* Name: 
poly_tobytes +* Name: PQCLEAN_KYBER51290S_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER51290S_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER51290S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER51290S_AVX2_qdata); +} + +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + const __m256i idx = 
_mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_load_si256((__m256i *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(r->coeffs, a); - 
PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + __m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER51290S_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - PQCLEAN_KYBER51290S_AVX2_cbd(r, buf); +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, 
sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER51290S_AVX2_cbd(r, buf.arr); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER51290S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -114,73 +211,78 @@ void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_ * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER51290S_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER51290S_AVX2_zetas_exp); - PQCLEAN_KYBER51290S_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER51290S_AVX2_zetas_exp); - PQCLEAN_KYBER51290S_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER51290S_AVX2_zetas_exp + 4); - PQCLEAN_KYBER51290S_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER51290S_AVX2_zetas_exp + 200); + PQCLEAN_KYBER51290S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER51290S_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER51290S_AVX2_zetas_inv_exp); - PQCLEAN_KYBER51290S_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER51290S_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER51290S_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER51290S_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(poly *r) { + 
PQCLEAN_KYBER51290S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER51290S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER51290S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER51290S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs + 64, - a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 380); +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER51290S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER51290S_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER51290S_AVX2_frommont_avx(r->coeffs); - PQCLEAN_KYBER51290S_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - 
poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER51290S_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER51290S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER51290S_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER51290S_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER51290S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER51290S_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER51290S_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER51290S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER51290S_AVX2_poly_add * * Description: Add two polynomials * @@ -189,18 +291,19 @@ void PQCLEAN_KYBER51290S_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER51290S_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -209,127 +312,13 @@ void PQCLEAN_KYBER51290S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER51290S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i 
*)&b->coeffs[i]); - vec0 = _mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; 
i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = _mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = 
_mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber512-90s/avx2/poly.h b/crypto_kem/kyber512-90s/avx2/poly.h index 6585d4f3..6e4e5a16 100644 --- a/crypto_kem/kyber512-90s/avx2/poly.h +++ b/crypto_kem/kyber512-90s/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER51290S_AVX2_POLY_H +#define PQCLEAN_KYBER51290S_AVX2_POLY_H #include "params.h" - #include #include @@ -11,32 +10,47 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER51290S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER51290S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER51290S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER51290S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER51290S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER51290S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER51290S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER51290S_AVX2_poly_ntt(poly *r); -void PQCLEAN_KYBER51290S_AVX2_poly_invntt(poly *r); 
+ +void PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER51290S_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER51290S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER51290S_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER51290S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER51290S_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER51290S_AVX2_poly_reduce(poly *r); + void PQCLEAN_KYBER51290S_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER51290S_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER51290S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber512-90s/avx2/polyvec.c b/crypto_kem/kyber512-90s/avx2/polyvec.c index a4aa8682..bbc8289c 100644 --- a/crypto_kem/kyber512-90s/avx2/polyvec.c +++ b/crypto_kem/kyber512-90s/avx2/polyvec.c @@ -1,157 +1,188 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + 
for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER51290S_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const 
uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER51290S_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER51290S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER51290S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector 
of polynomials +* for details of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER51290S_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -160,7 +191,8 @@ void PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER51290S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber512-90s/avx2/polyvec.h b/crypto_kem/kyber512-90s/avx2/polyvec.h index 9c411959..2e18524f 100644 --- a/crypto_kem/kyber512-90s/avx2/polyvec.h +++ b/crypto_kem/kyber512-90s/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER51290S_AVX2_POLYVEC_H +#define PQCLEAN_KYBER51290S_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER51290S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER51290S_AVX2_polyvec_decompress(polyvec *r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER51290S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER51290S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER51290S_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER51290S_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER51290S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER51290S_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER51290S_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER51290S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber512-90s/avx2/reduce.h b/crypto_kem/kyber512-90s/avx2/reduce.h index 04f810b2..ea30af38 100644 --- a/crypto_kem/kyber512-90s/avx2/reduce.h +++ b/crypto_kem/kyber512-90s/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER51290S_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER51290S_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER51290S_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER51290S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + +int16_t PQCLEAN_KYBER51290S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); + +int16_t PQCLEAN_KYBER51290S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER51290S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512-90s/avx2/rejsample.c b/crypto_kem/kyber512-90s/avx2/rejsample.c index ef1bd64e..b676af83 100644 --- a/crypto_kem/kyber512-90s/avx2/rejsample.c +++ b/crypto_kem/kyber512-90s/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" #include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const 
uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, 
- { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 
0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 
2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 
0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 
0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, 
+ { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, 
+ { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, 
+ { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER51290S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 576 +unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER51290S_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER51290S_AVX2_qdata.as_arr[_16XQ]); + 
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER51290S_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = 
_mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = _mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - 
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned 
int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber512-90s/avx2/rejsample.h b/crypto_kem/kyber512-90s/avx2/rejsample.h index 78d2e2c9..eddd5096 100644 --- a/crypto_kem/kyber512-90s/avx2/rejsample.h +++ b/crypto_kem/kyber512-90s/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER51290S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER51290S_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/shuffle.s b/crypto_kem/kyber512-90s/avx2/shuffle.S similarity index 65% rename from crypto_kem/kyber1024-90s/avx2/shuffle.s rename to crypto_kem/kyber512-90s/avx2/shuffle.S index 2a364a6c..ff87dd54 100644 --- a/crypto_kem/kyber1024-90s/avx2/shuffle.s +++ b/crypto_kem/kyber512-90s/avx2/shuffle.S @@ -1,12 +1,9 @@ +#include "cdecl.inc" .include "fq.inc" .include "shuffle.inc" -.global PQCLEAN_KYBER102490S_AVX2_nttunpack_avx -PQCLEAN_KYBER102490S_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 - +/* +nttpack_avx: #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 
10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret */ +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 @@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx -PQCLEAN_KYBER102490S_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 +.global cdecl(PQCLEAN_KYBER51290S_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret +ntttobytes128_avx: #load vmovdqa (%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 @@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm12 #csubq -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 #bitpack vpsllw $12,%ymm6,%ymm4 @@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx -PQCLEAN_KYBER102490S_AVX2_nttfrombytes_avx: +.global cdecl(PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_ntttobytes_avx): #consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xmask(%rip),%ymm0 +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret +nttfrombytes128_avx: #load vmovdqu (%rsi),%ymm4 vmovdqu 32(%rsi),%ymm5 @@ -204,3 
+243,13 @@ vmovdqa %ymm15,192(%rdi) vmovdqa %ymm1,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx) +cdecl(PQCLEAN_KYBER51290S_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/crypto_kem/kyber512-90s/avx2/shuffle.inc b/crypto_kem/kyber512-90s/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber512-90s/avx2/shuffle.inc +++ b/crypto_kem/kyber512-90s/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber512-90s/avx2/symmetric.h b/crypto_kem/kyber512-90s/avx2/symmetric.h index 144c3c00..b6a1109c 100644 --- a/crypto_kem/kyber512-90s/avx2/symmetric.h +++ b/crypto_kem/kyber512-90s/avx2/symmetric.h @@ -2,22 +2,26 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "aes256ctr.h" #include "sha2.h" -#define hash_h(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) -#define hash_g(OUT, IN, INBYTES) sha512((OUT), (IN), (INBYTES)) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER51290S_AVX2_aes256ctr_init((STATE), (IN), (Y) + ((uint16_t)(X) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks((OUT), (OUTBLOCKS), (STATE)) -#define xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf((OUT), (OUTBYTES), (KEY), (NONCE)) -#define kdf(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) - -#define XOF_BLOCKBYTES 128 - typedef aes256ctr_ctx xof_state; +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) +#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) +#define xof_absorb(STATE, SEED, X, Y) \ + 
PQCLEAN_KYBER51290S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_KYBER51290S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER51290S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) + #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber512-90s/avx2/verify.c b/crypto_kem/kyber512-90s/avx2/verify.c index 9cf32ac3..af690208 100644 --- a/crypto_kem/kyber512-90s/avx2/verify.c +++ b/crypto_kem/kyber512-90s/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER51290S_AVX2_verify * * Description: Compare two arrays for equality in constant time. * -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = 
_mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER51290S_AVX2_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. * -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER51290S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER51290S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber512-90s/avx2/verify.h b/crypto_kem/kyber512-90s/avx2/verify.h index 005c3758..300077db 100644 --- a/crypto_kem/kyber512-90s/avx2/verify.h +++ b/crypto_kem/kyber512-90s/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER51290S_AVX2_VERIFY_H +#define PQCLEAN_KYBER51290S_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER51290S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER51290S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, 
uint8_t b); diff --git a/crypto_kem/kyber512-90s/clean/LICENSE b/crypto_kem/kyber512-90s/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber512-90s/clean/LICENSE +++ b/crypto_kem/kyber512-90s/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber512-90s/clean/Makefile b/crypto_kem/kyber512-90s/clean/Makefile index aff039c5..7f676006 100644 --- a/crypto_kem/kyber512-90s/clean/Makefile +++ b/crypto_kem/kyber512-90s/clean/Makefile @@ -1,8 +1,29 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512-90s_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h aes256ctr.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o aes256ctr.o +HEADERS= \ + api.h \ + cbd.h \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + symmetric-aes.h \ + symmetric.h \ + verify.h +OBJECTS= \ + cbd.o \ + indcpa.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + reduce.o \ + verify.o \ + symmetric-aes.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake index 69eced0f..8a663b40 100644 --- a/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber512-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber512-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj aes256ctr.obj +OBJECTS=cbd.obj 
indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-aes.o # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber512-90s/clean/cbd.c b/crypto_kem/kyber512-90s/clean/cbd.c index ab69db4f..bf8f5bef 100644 --- a/crypto_kem/kyber512-90s/clean/cbd.c +++ b/crypto_kem/kyber512-90s/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" #include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x **************************************************/ -static uint32_t load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER51290S_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 
0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber512-90s/clean/cbd.h b/crypto_kem/kyber512-90s/clean/cbd.h index a3f4c21d..676cddcc 100644 --- a/crypto_kem/kyber512-90s/clean/cbd.h +++ b/crypto_kem/kyber512-90s/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_CBD_H +#define PQCLEAN_KYBER51290S_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER51290S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber512-90s/clean/indcpa.c b/crypto_kem/kyber512-90s/clean/indcpa.c index 5f0ec6e0..f06a2d79 100644 --- a/crypto_kem/kyber512-90s/clean/indcpa.c +++ b/crypto_kem/kyber512-90s/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. 
* * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input 
vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER51290S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: 
- polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER51290S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) **************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val 
-= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER51290S_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER51290S_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER51290S_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER51290S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, 
KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + 
PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER51290S_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER51290S_CLEAN_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER51290S_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER51290S_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t *c, 
PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER51290S_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER51290S_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER51290S_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER51290S_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER51290S_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER51290S_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER51290S_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber512-90s/clean/indcpa.h b/crypto_kem/kyber512-90s/clean/indcpa.h index e9207d94..8d3571b4 100644 --- a/crypto_kem/kyber512-90s/clean/indcpa.h +++ b/crypto_kem/kyber512-90s/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_INDCPA_H +#define PQCLEAN_KYBER51290S_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER51290S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void 
PQCLEAN_KYBER51290S_CLEAN_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber512-90s/clean/kem.c b/crypto_kem/kyber512-90s/clean/kem.c index f5a6efbd..ef28920c 100644 --- a/crypto_kem/kyber512-90s/clean/kem.c +++ b/crypto_kem/kyber512-90s/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int 
PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER51290S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, 
KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + hash_h(buf, buf, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER51290S_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER51290S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER51290S_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER51290S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER51290S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k 
and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512-90s/clean/kem.h b/crypto_kem/kyber512-90s/clean/kem.h new file mode 100644 index 00000000..9373b40d --- /dev/null +++ b/crypto_kem/kyber512-90s/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER51290S_CLEAN_KEM_H +#define PQCLEAN_KYBER51290S_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER51290S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber512-90s/clean/ntt.c b/crypto_kem/kyber512-90s/clean/ntt.c index 28ee4c95..f2a7ecf1 100644 --- a/crypto_kem/kyber512-90s/clean/ntt.c +++ b/crypto_kem/kyber512-90s/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER51290S_CLEAN_zetas and PQCLEAN_KYBER51290S_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = 
tmp[tree[i]]; + PQCLEAN_KYBER51290S_CLEAN_zetas[i] = tmp[tree[i]]; k = 0; for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER51290S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER51290S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; 
const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER51290S_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements 
of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER51290S_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER51290S_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER51290S_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER51290S_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER51290S_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial 
**************************************************/ void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber512-90s/clean/ntt.h b/crypto_kem/kyber512-90s/clean/ntt.h index 720bee97..8a96c3d4 100644 --- a/crypto_kem/kyber512-90s/clean/ntt.h +++ b/crypto_kem/kyber512-90s/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_NTT_H +#define PQCLEAN_KYBER51290S_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER51290S_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER51290S_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER51290S_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER51290S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber512-90s/clean/params.h b/crypto_kem/kyber512-90s/clean/params.h index d086d4c6..16775a59 100644 --- a/crypto_kem/kyber512-90s/clean/params.h +++ b/crypto_kem/kyber512-90s/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER51290S_CLEAN_PARAMS_H +#define PQCLEAN_KYBER51290S_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 #define KYBER_POLYCOMPRESSEDBYTES 96 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define 
KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber512-90s/clean/poly.c b/crypto_kem/kyber512-90s/clean/poly.c index 1aee99dd..f45e57dd 100644 --- a/crypto_kem/kyber512-90s/clean/poly.c +++ b/crypto_kem/kyber512-90s/clean/poly.c @@ -1,118 +1,175 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 0; j < 8; 
j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; } - r[k] = (uint8_t)( t[0] | (t[1] << 3) | (t[2] << 6)); - r[k + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[k + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); - k += 3; + r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + r += 3; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER51290S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 2] = (int16_t)(((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 4] = (int16_t)(((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 5] = (int16_t)(((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 6] = (int16_t)(((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 7] = (int16_t)(((((a[2] >> 5)) * 
KYBER_Q) + 4) >> 3); +void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 3); + t[2] = (a[0] >> 6) | (a[1] << 2); + t[3] = (a[1] >> 1); + t[4] = (a[1] >> 4); + t[5] = (a[1] >> 7) | (a[2] << 1); + t[6] = (a[2] >> 2); + t[7] = (a[2] >> 5); a += 3; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER51290S_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: 
pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + 
PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER51290S_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -126,20 +183,20 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer 
to in/output polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER51290S_CLEAN_invntt(r->coeffs); } /************************************************* -* Name: poly_basemul +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -147,68 +204,64 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER51290S_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER51290S_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER51290S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER51290S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER51290S_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial 
**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER51290S_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce( - (int32_t)r->coeffs[i] * f); + for (i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER51290S_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_add * * Description: Add two polynomials * @@ -217,13 +270,14 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER51290S_CLEAN_poly_sub * * Description: Subtract two polynomials * @@ -232,48 +286,8 @@ void PQCLEAN_KYBER51290S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - 
uint16_t mask; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER51290S_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber512-90s/clean/poly.h b/crypto_kem/kyber512-90s/clean/poly.h index fbab1da7..dc968ea7 100644 --- a/crypto_kem/kyber512-90s/clean/poly.h +++ b/crypto_kem/kyber512-90s/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_POLY_H +#define PQCLEAN_KYBER51290S_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER51290S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER51290S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER51290S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER51290S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER51290S_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER51290S_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER51290S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER51290S_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER51290S_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER51290S_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER51290S_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER51290S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + void 
PQCLEAN_KYBER51290S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber512-90s/clean/polyvec.c b/crypto_kem/kyber512-90s/clean/polyvec.c index 8d90e7e0..5dbdc260 100644 --- a/crypto_kem/kyber512-90s/clean/polyvec.c +++ b/crypto_kem/kyber512-90s/clean/polyvec.c @@ -1,128 +1,153 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] 
>> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER51290S_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; 
+ } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: 
PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER51290S_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER51290S_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER51290S_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER51290S_CLEAN_poly_add(r, r, &t); } @@ -130,37 +155,40 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see 
comments in reduce.c +* for details of conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER51290S_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -169,7 +197,8 @@ void PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER51290S_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber512-90s/clean/polyvec.h b/crypto_kem/kyber512-90s/clean/polyvec.h index abf3fb9b..57ce4507 100644 --- a/crypto_kem/kyber512-90s/clean/polyvec.h +++ b/crypto_kem/kyber512-90s/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER51290S_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void 
PQCLEAN_KYBER51290S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER51290S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER51290S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER51290S_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER51290S_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER51290S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER51290S_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER51290S_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER51290S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber512-90s/clean/reduce.c b/crypto_kem/kyber512-90s/clean/reduce.c index 2447fef1..ad313be0 100644 --- a/crypto_kem/kyber512-90s/clean/reduce.c +++ b/crypto_kem/kyber512-90s/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: montgomery_reduce +* Name: PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER51290S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER51290S_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t x: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber512-90s/clean/reduce.h b/crypto_kem/kyber512-90s/clean/reduce.h index f9a9b762..d79f51f3 100644 --- a/crypto_kem/kyber512-90s/clean/reduce.h +++ b/crypto_kem/kyber512-90s/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_REDUCE_H +#define PQCLEAN_KYBER51290S_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER51290S_CLEAN_montgomery_reduce(int32_t a); + int16_t 
PQCLEAN_KYBER51290S_CLEAN_barrett_reduce(int16_t a); + int16_t PQCLEAN_KYBER51290S_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber512-90s/clean/aes256ctr.c b/crypto_kem/kyber512-90s/clean/symmetric-aes.c similarity index 98% rename from crypto_kem/kyber512-90s/clean/aes256ctr.c rename to crypto_kem/kyber512-90s/clean/symmetric-aes.c index d15194b3..a492b66d 100644 --- a/crypto_kem/kyber512-90s/clean/aes256ctr.c +++ b/crypto_kem/kyber512-90s/clean/symmetric-aes.c @@ -1,4 +1,4 @@ -#include "aes256ctr.h" +#include "symmetric-aes.h" #include "aes.h" #include #include @@ -14,7 +14,7 @@ static inline void br_enc32be(unsigned char *dst, uint32_t x) { static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { uint8_t ivw[16]; uint8_t buf[AES_BLOCKBYTES]; - size_t i; + size_t i = 0; memcpy(ivw, iv, AESCTR_NONCEBYTES); br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); @@ -94,7 +94,6 @@ void PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblo s->ctr += (uint32_t) (4 * nblocks); } -/** Free the AES ctx **/ void PQCLEAN_KYBER51290S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { aes256_ctx_release(&s->sk_exp); } diff --git a/crypto_kem/kyber512-90s/clean/aes256ctr.h b/crypto_kem/kyber512-90s/clean/symmetric-aes.h similarity index 100% rename from crypto_kem/kyber512-90s/clean/aes256ctr.h rename to crypto_kem/kyber512-90s/clean/symmetric-aes.h diff --git a/crypto_kem/kyber512-90s/clean/symmetric.h b/crypto_kem/kyber512-90s/clean/symmetric.h index e1bc2743..e16c8145 100644 --- a/crypto_kem/kyber512-90s/clean/symmetric.h +++ b/crypto_kem/kyber512-90s/clean/symmetric.h @@ -2,22 +2,24 @@ #define SYMMETRIC_H #include "params.h" +#include +#include -#include "aes256ctr.h" #include "sha2.h" +#include "symmetric-aes.h" + +typedef aes256xof_ctx xof_state; + +#define XOF_BLOCKBYTES 64 #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, 
INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb(STATE, IN, X, Y) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER51290S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER51290S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) PQCLEAN_KYBER51290S_CLEAN_aes256xof_ctx_release(STATE) #define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER51290S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) -#define XOF_BLOCKBYTES 64 - -typedef aes256xof_ctx xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber512-90s/clean/verify.c b/crypto_kem/kyber512-90s/clean/verify.c index 35867a99..18186fa9 100644 --- a/crypto_kem/kyber512-90s/clean/verify.c +++ b/crypto_kem/kyber512-90s/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER51290S_CLEAN_verify * * Description: Compare two arrays for equality in constant time. * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER51290S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER51290S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER51290S_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. 
Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER51290S_CLEAN_verify(const uint8_t *a, const uint8_t *b, siz * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER51290S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber512-90s/clean/verify.h b/crypto_kem/kyber512-90s/clean/verify.h index 7ece5735..c911c51c 100644 --- a/crypto_kem/kyber512-90s/clean/verify.h +++ b/crypto_kem/kyber512-90s/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER51290S_CLEAN_VERIFY_H +#define PQCLEAN_KYBER51290S_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER51290S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER51290S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER51290S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber512/META.yml b/crypto_kem/kyber512/META.yml index 6dbcd49a..9de99b60 100644 --- a/crypto_kem/kyber512/META.yml +++ b/crypto_kem/kyber512/META.yml @@ -20,14 +20,15 @@ auxiliary-submitters: - Gregor Seiler - Damien Stehlé implementations: - - name: clean - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - required_flags: - - avx2 - - bmi2 + - name: clean + version: 
https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + - name: avx2 + version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 diff --git a/crypto_kem/kyber512/avx2/LICENSE b/crypto_kem/kyber512/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber512/avx2/LICENSE +++ b/crypto_kem/kyber512/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber512/avx2/Makefile b/crypto_kem/kyber512/avx2/Makefile index 81058f2e..b619b1c2 100644 --- a/crypto_kem/kyber512/avx2/Makefile +++ b/crypto_kem/kyber512/avx2/Makefile @@ -1,26 +1,58 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h fips202x4.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o fips202x4.o symmetric-fips202.o +HEADERS= \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fips202x4.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + basemul.o \ + cbd.o \ + consts.o \ + fips202x4.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + symmetric-shake.o \ + verify.o KECCAK4XDIR=../../../common/keccak4x 
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) -CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $(OBJECTS) $(KECCAK4X) diff --git a/crypto_kem/kyber512/avx2/align.h b/crypto_kem/kyber512/avx2/align.h new file mode 100644 index 00000000..88623df4 --- /dev/null +++ b/crypto_kem/kyber512/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER512_AVX2_ALIGN_H +#define PQCLEAN_KYBER512_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber512/avx2/basemul.S b/crypto_kem/kyber512/avx2/basemul.S index 9598c305..fd513a51 100644 --- a/crypto_kem/kyber512/avx2/basemul.S +++ b/crypto_kem/kyber512/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER512_AVX2_basemul_acc_avx -PQCLEAN_KYBER512_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER512_AVX2_16xqinv(%rip),%ymm1 -vmovdqu 
(%rcx),%ymm2 - +.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -101,7 +97,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -128,7 +124,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -136,17 +132,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_basemul_avx -PQCLEAN_KYBER512_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER512_AVX2_basemul_acc_avx): #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER512_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -155,10 +174,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER512_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git 
a/crypto_kem/kyber512/avx2/cbd.c b/crypto_kem/kyber512/avx2/cbd.c index e3806fdc..be5771f0 100644 --- a/crypto_kem/kyber512/avx2/cbd.c +++ b/crypto_kem/kyber512/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER512_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *buf: pointer to input byte array +* - const unsigned char *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER512_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER512_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber512/avx2/cbd.h b/crypto_kem/kyber512/avx2/cbd.h index 37bbdbc1..70a3a134 100644 --- a/crypto_kem/kyber512/avx2/cbd.h +++ b/crypto_kem/kyber512/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER512_AVX2_CBD_H +#define PQCLEAN_KYBER512_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER512_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER512_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber512/avx2/cdecl.inc b/crypto_kem/kyber512/avx2/cdecl.inc new 
file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber512/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber512/avx2/consts.c b/crypto_kem/kyber512/avx2/consts.c index 2ca970fd..74edb379 100644 --- a/crypto_kem/kyber512/avx2/consts.c +++ b/crypto_kem/kyber512/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER512_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER512_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 
2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 
52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) -#define FLO (FHI * QINV % 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t 
PQCLEAN_KYBER512_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER512_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 
1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 
42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 
130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git a/crypto_kem/kyber512/avx2/consts.h b/crypto_kem/kyber512/avx2/consts.h index dd6e7135..050400ee 100644 --- a/crypto_kem/kyber512/avx2/consts.h +++ b/crypto_kem/kyber512/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER512_AVX2_CONSTS_H +#define PQCLEAN_KYBER512_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER512_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER512_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xmontsqlo; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER512_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER512_AVX2_qdata; #endif diff --git a/crypto_kem/kyber512/avx2/fips202x4.c b/crypto_kem/kyber512/avx2/fips202x4.c index d63274e8..73c4b416 100644 --- 
a/crypto_kem/kyber512/avx2/fips202x4.c +++ b/crypto_kem/kyber512/avx2/fips202x4.c @@ -1,148 +1,111 @@ #include "fips202.h" #include "fips202x4.h" -#include "params.h" - #include +#include #include +#include +/* Use implementation from the Keccak Code Package */ +#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds +extern void KeccakF1600_StatePermute4x(__m256i *s); -#define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset)))) - -static uint64_t load64(const uint8_t *x) { - unsigned long long r = 0, i; - - for (i = 0; i < 8; ++i) { - r |= (unsigned long long)x[i] << 8 * i; - } - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - size_t i; +static inline void store64(uint8_t x[8], uint64_t u) { + unsigned int i = 0; - for (i = 0; i < 8; ++i) { - x[i] = (uint8_t)u; - u >>= 8; + for (i = 0; i < 8; i++) { + x[i] = u >> 8 * i; } } -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, +static void keccakx4_absorb(__m256i s[25], unsigned int r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, uint8_t p) { - size_t i; - uint8_t t0[200] = {0}; - uint8_t t1[200] = {0}; - uint8_t t2[200] = {0}; - uint8_t t3[200] = {0}; + size_t i = 0, pos = 0; + __m256i t, idx; - unsigned long long *ss = (unsigned long long *)s; + for (i = 0; i < 25; ++i) { + s[i] = _mm256_setzero_si256(); + } - while (mlen >= r) { + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 
* i); + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; + inlen -= r; } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; + i = 0; + while (inlen >= 8) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + + i++; + pos += 8; + inlen -= 8; } - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 + 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); } + + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, size_t nblocks, - __m256i *s, - unsigned int r) { - unsigned long long *ss = (unsigned long long *)s; + unsigned int r, + __m256i s[25]) { + unsigned int i = 0; + uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < (r >> 3); i++) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); + for (i = 0; i < r / 8; ++i) { + f0 = _mm256_extract_epi64(s[i], 0); + f1 = _mm256_extract_epi64(s[i], 1); + f2 = _mm256_extract_epi64(s[i], 2); + 
f3 = _mm256_extract_epi64(s[i], 3); + store64(out0, f0); + store64(out1, f1); + store64(out2, f2); + store64(out3, f3); + + out0 += 8; + out1 += 8; + out2 += 8; + out3 += 8; } - h0 += r; - h1 += r; - h2 += r; - h3 += r; - nblocks--; - } -} -void PQCLEAN_KYBER512_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 2]; - - for (size_t i = 0; i < KYBER_SYMBYTES; ++i) { - extseed[0][i] = seed[i]; - extseed[1][i] = seed[i]; - extseed[2][i] = seed[i]; - extseed[3][i] = seed[i]; - } - extseed[0][KYBER_SYMBYTES + 0] = (uint8_t)nonce0; - extseed[0][KYBER_SYMBYTES + 1] = (uint8_t)(nonce0 >> 8); - extseed[1][KYBER_SYMBYTES + 0] = (uint8_t)nonce1; - extseed[1][KYBER_SYMBYTES + 1] = (uint8_t)(nonce1 >> 8); - extseed[2][KYBER_SYMBYTES + 0] = (uint8_t)nonce2; - extseed[2][KYBER_SYMBYTES + 1] = (uint8_t)(nonce2 >> 8); - extseed[3][KYBER_SYMBYTES + 0] = (uint8_t)nonce3; - extseed[3][KYBER_SYMBYTES + 1] = (uint8_t)(nonce3 >> 8); - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - state->s[i] = _mm256_xor_si256(state->s[i], state->s[i]); + --nblocks; } +} - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(state->s, SHAKE128_RATE, extseed[0], extseed[1], extseed[2], extseed[3], KYBER_SYMBYTES + 2, 0x1F); +void PQCLEAN_KYBER512_AVX2_shake128x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -150,82 +113,78 @@ void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out2, uint8_t *out3, size_t nblocks, - keccak4x_state *state) { - keccak_squeezeblocks4x(out0, out1, out2, out3, nblocks, state->s, SHAKE128_RATE); + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, 
out1, out2, out3, nblocks, SHAKE128_RATE, + state->s); } -static void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, size_t inlen) { - __m256i s[25]; - uint8_t t0[SHAKE256_RATE]; - uint8_t t1[SHAKE256_RATE]; - uint8_t t2[SHAKE256_RATE]; - uint8_t t3[SHAKE256_RATE]; - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - s[i] = _mm256_xor_si256(s[i], s[i]); - } - - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); - - /* Squeeze output */ - keccak_squeezeblocks4x(out0, out1, out2, out3, outlen / SHAKE256_RATE, s, SHAKE256_RATE); +void PQCLEAN_KYBER512_AVX2_shake256x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} - out0 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out1 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out2 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out3 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; +void PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, + state->s); +} - if (outlen % SHAKE256_RATE) { - keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE256_RATE); - for (size_t i = 0; i < outlen % SHAKE256_RATE; i++) { - out0[i] = t0[i]; - out1[i] = t1[i]; - out2[i] = t2[i]; - out3[i] = t3[i]; +void PQCLEAN_KYBER512_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + 
keccakx4_state state; + + PQCLEAN_KYBER512_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } -void PQCLEAN_KYBER512_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 1]; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - extseed[0][i] = key[i]; - extseed[1][i] = key[i]; - extseed[2][i] = key[i]; - extseed[3][i] = key[i]; +void PQCLEAN_KYBER512_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + keccakx4_state state; + + PQCLEAN_KYBER512_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } } - extseed[0][KYBER_SYMBYTES] = nonce0; - extseed[1][KYBER_SYMBYTES] = nonce1; 
- extseed[2][KYBER_SYMBYTES] = nonce2; - extseed[3][KYBER_SYMBYTES] = nonce3; - - shake256x4(out0, - out1, - out2, - out3, - outlen, - extseed[0], - extseed[1], - extseed[2], - extseed[3], - KYBER_SYMBYTES + 1); } diff --git a/crypto_kem/kyber512/avx2/fips202x4.h b/crypto_kem/kyber512/avx2/fips202x4.h index fbdbc80e..4f1bd110 100644 --- a/crypto_kem/kyber512/avx2/fips202x4.h +++ b/crypto_kem/kyber512/avx2/fips202x4.h @@ -7,31 +7,19 @@ typedef struct { __m256i s[25]; -} keccak4x_state; - -void PQCLEAN_KYBER512_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); - -void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccak4x_state *state); - -void PQCLEAN_KYBER512_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3); +} keccakx4_state; + +void PQCLEAN_KYBER512_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state); + +void PQCLEAN_KYBER512_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, + keccakx4_state *state); + +void PQCLEAN_KYBER512_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER512_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, 
uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); #endif diff --git a/crypto_kem/kyber1024-90s/avx2/fq.s b/crypto_kem/kyber512/avx2/fq.S similarity index 54% rename from crypto_kem/kyber1024-90s/avx2/fq.s rename to crypto_kem/kyber512/avx2/fq.S index 7022e76b..00a3a599 100644 --- a/crypto_kem/kyber1024-90s/avx2/fq.s +++ b/crypto_kem/kyber512/avx2/fq.S @@ -1,11 +1,8 @@ +#include "cdecl.inc" .include "fq.inc" -.global PQCLEAN_KYBER102490S_AVX2_reduce_avx -PQCLEAN_KYBER102490S_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xv(%rip),%ymm1 - +.text +reduce128_avx: #load vmovdqa (%rdi),%ymm2 vmovdqa 32(%rdi),%ymm3 @@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 #store vmovdqa %ymm2,(%rdi) @@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_csubq_avx -PQCLEAN_KYBER102490S_AVX2_csubq_avx: +.global cdecl(PQCLEAN_KYBER512_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER512_AVX2_reduce_avx): #consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret +csubq128_avx: #load vmovdqa (%rdi),%ymm1 vmovdqa 32(%rdi),%ymm2 @@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6 vmovdqa 192(%rdi),%ymm7 vmovdqa 224(%rdi),%ymm8 -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 #store vmovdqa %ymm1,(%rdi) @@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi) ret -.global PQCLEAN_KYBER102490S_AVX2_frommont_avx -PQCLEAN_KYBER102490S_AVX2_frommont_avx: 
+.global cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER512_AVX2_csubq_avx): #consts -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER102490S_AVX2_16xmontsqhi(%rip),%ymm2 +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret +tomont128_avx: #load vmovdqa (%rdi),%ymm3 vmovdqa 32(%rdi),%ymm4 @@ -90,14 +96,14 @@ vmovdqa 160(%rdi),%ymm8 vmovdqa 192(%rdi),%ymm9 vmovdqa 224(%rdi),%ymm10 -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 -fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 #store vmovdqa %ymm3,(%rdi) @@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi) vmovdqa %ymm10,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER512_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER512_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber512/avx2/fq.inc b/crypto_kem/kyber512/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber512/avx2/fq.inc +++ b/crypto_kem/kyber512/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro 
fqmulprecomp al,ah,b x=12 +.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber512/avx2/indcpa.c b/crypto_kem/kyber512/avx2/indcpa.c index 5f40c770..f0e04088 100644 --- a/crypto_kem/kyber512/avx2/indcpa.c +++ b/crypto_kem/kyber512/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include "ntt.h" +#include "params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include /************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER512_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to 
generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER512_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER512_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER512_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and 
serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER512_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER512_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER512_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER512_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output 
buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,52 +150,75 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER512_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER512_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER512_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr0, ctr1, ctr2, ctr3, bufbytes; - union { - uint8_t x[4][XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; - keccak4x_state state; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER512_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int ctr0 = 0, ctr1 = 0, ctr2 = 0, ctr3 = 0; + ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + __m256i f; + keccakx4_state state; + + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); if (transposed) { - PQCLEAN_KYBER512_AVX2_kyber_shake128x4_absorb(&state, seed, 0, 256, 1, 257); + buf.arr[0][KYBER_SYMBYTES + 0] = 0; + buf.arr[0][KYBER_SYMBYTES + 1] = 0; + buf.arr[1][KYBER_SYMBYTES + 0] = 0; + buf.arr[1][KYBER_SYMBYTES + 1] = 1; + buf.arr[2][KYBER_SYMBYTES + 0] = 1; + buf.arr[2][KYBER_SYMBYTES + 1] = 0; + buf.arr[3][KYBER_SYMBYTES + 0] = 1; + buf.arr[3][KYBER_SYMBYTES + 1] = 1; } else { - PQCLEAN_KYBER512_AVX2_kyber_shake128x4_absorb(&state, seed, 0, 1, 256, 257); + buf.arr[0][KYBER_SYMBYTES + 0] = 0; + buf.arr[0][KYBER_SYMBYTES + 1] = 0; + buf.arr[1][KYBER_SYMBYTES + 0] = 1; + buf.arr[1][KYBER_SYMBYTES + 1] = 
0; + buf.arr[2][KYBER_SYMBYTES + 0] = 0; + buf.arr[2][KYBER_SYMBYTES + 1] = 1; + buf.arr[3][KYBER_SYMBYTES + 0] = 1; + buf.arr[3][KYBER_SYMBYTES + 1] = 1; } - PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state); - bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; + PQCLEAN_KYBER512_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, + &state); - ctr0 = PQCLEAN_KYBER512_AVX2_rej_uniform(a[0].vec[0].coeffs, KYBER_N, buf.x[0], bufbytes); - ctr1 = PQCLEAN_KYBER512_AVX2_rej_uniform(a[0].vec[1].coeffs, KYBER_N, buf.x[1], bufbytes); - ctr2 = PQCLEAN_KYBER512_AVX2_rej_uniform(a[1].vec[0].coeffs, KYBER_N, buf.x[2], bufbytes); - ctr3 = PQCLEAN_KYBER512_AVX2_rej_uniform(a[1].vec[1].coeffs, KYBER_N, buf.x[3], bufbytes); + ctr0 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf.arr[0]); + ctr1 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf.arr[1]); + ctr2 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf.arr[2]); + ctr3 = PQCLEAN_KYBER512_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf.arr[3]); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state); - bufbytes = XOF_BLOCKBYTES; - - ctr0 += rej_uniform_ref(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); - ctr1 += rej_uniform_ref(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes); - ctr2 += rej_uniform_ref(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes); - ctr3 += rej_uniform_ref(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes); + PQCLEAN_KYBER512_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + ctr0 += rej_uniform(a[0].vec[0].coeffs + 
ctr0, KYBER_N - ctr0, buf.arr[0], + XOF_BLOCKBYTES); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], + XOF_BLOCKBYTES); + ctr2 += rej_uniform(a[1].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], + XOF_BLOCKBYTES); + ctr3 += rej_uniform(a[1].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], + XOF_BLOCKBYTES); } PQCLEAN_KYBER512_AVX2_poly_nttunpack(&a[0].vec[0]); @@ -171,35 +228,39 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER512_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER512_AVX2_poly_getnoise4x(skpv.vec + 0, 
skpv.vec + 1, e.vec + 0, e.vec + 1, noiseseed, nonce + 0, nonce + 1, nonce + 2, nonce + 3); + PQCLEAN_KYBER512_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, e.vec + 0, e.vec + 1, noiseseed, + 0, 1, 2, 3); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - PQCLEAN_KYBER512_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER512_AVX2_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER512_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -210,44 +271,50 @@ void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER512_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER512_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, seed.arr); - PQCLEAN_KYBER512_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1, coins, nonce + 0, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_KYBER512_AVX2_poly_getnoise(&epp, coins, nonce + 4); + PQCLEAN_KYBER512_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, ep.vec + 0, ep.vec + 1, coins, + 0, 1, 2, 3); + PQCLEAN_KYBER512_AVX2_poly_getnoise(&epp, coins, 
4); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER512_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER512_AVX2_poly_invntt(&v); + PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER512_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER512_AVX2_poly_add(&v, &v, &epp); @@ -259,18 +326,21 @@ void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER512_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER512_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER512_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -278,8 +348,8 @@ void PQCLEAN_KYBER512_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER512_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER512_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER512_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER512_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber512/avx2/indcpa.h b/crypto_kem/kyber512/avx2/indcpa.h index 82d39756..d0933b2f 100644 --- a/crypto_kem/kyber512/avx2/indcpa.h +++ b/crypto_kem/kyber512/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER512_AVX2_INDCPA_H +#define PQCLEAN_KYBER512_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER512_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER512_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER512_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - 
const uint8_t *coins); +void PQCLEAN_KYBER512_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER512_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER512_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER512_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber512-90s/avx2/invntt.s b/crypto_kem/kyber512/avx2/invntt.S similarity index 75% rename from crypto_kem/kyber512-90s/avx2/invntt.s rename to crypto_kem/kyber512/avx2/invntt.S index 6f3dea10..fa4707a4 100644 --- a/crypto_kem/kyber512-90s/avx2/invntt.s +++ b/crypto_kem/kyber512/avx2/invntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 #update & mul vpsubw %ymm\rh0,%ymm\rl0,%ymm12 vpsubw %ymm\rh1,%ymm\rl1,%ymm13 @@ -36,12 +37,8 @@ vpsubw %ymm\rh2,%ymm14,%ymm\rh2 vpsubw %ymm\rh3,%ymm15,%ymm\rh3 .endm -.global PQCLEAN_KYBER51290S_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER51290S_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 - +.text +invntt_levels0t5_avx: level0: #zetas vmovdqu (%rsi),%ymm15 @@ -59,14 +56,14 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 level1: #zetas vmovdqu 128(%rsi),%ymm3 vmovdqu 160(%rsi),%ymm2 -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 shuffle1 4,5,3,5 shuffle1 6,7,4,7 @@ -79,9 +76,9 @@ vmovdqu 192(%rsi),%ymm10 vmovdqu 224(%rsi),%ymm2 #consts -vmovdqa 
PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1 +vmovdqa _16XV*2(%rdx),%ymm1 -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 red16 3 @@ -95,7 +92,7 @@ level3: vmovdqu 256(%rsi),%ymm9 vmovdqu 288(%rsi),%ymm2 -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 red16 10 @@ -109,7 +106,7 @@ level4: vmovdqu 320(%rsi),%ymm7 vmovdqu 352(%rsi),%ymm2 -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 red16 9 @@ -123,7 +120,7 @@ level5: vpbroadcastd 384(%rsi),%ymm8 vpbroadcastd 388(%rsi),%ymm2 -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 red16 7 @@ -139,11 +136,7 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_invntt_level6_avx -PQCLEAN_KYBER51290S_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 - +invntt_level6_avx: #zetas vpbroadcastd (%rsi),%ymm1 vpbroadcastd 4(%rsi),%ymm2 @@ -161,8 +154,8 @@ vmovdqa 352(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,256(%rdi) @@ -170,10 +163,10 @@ vmovdqa %ymm9,288(%rdi) vmovdqa %ymm10,320(%rdi) vmovdqa %ymm11,352(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,(%rdi) @@ -194,8 +187,8 @@ vmovdqa 480(%rdi),%ymm11 butterfly 4,5,6,7,8,9,10,11 #consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xfhi(%rip),%ymm13 +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 #store vmovdqa %ymm8,384(%rdi) @@ -203,10 +196,10 @@ vmovdqa %ymm9,416(%rdi) vmovdqa %ymm10,448(%rdi) vmovdqa %ymm11,480(%rdi) -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 
12,13,7 11 +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 #store vmovdqa %ymm4,128(%rdi) @@ -215,3 +208,18 @@ vmovdqa %ymm6,192(%rdi) vmovdqa %ymm7,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER512_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber512/avx2/kem.c b/crypto_kem/kyber512/avx2/kem.c index d61bc0ea..4e723c98 100644 --- a/crypto_kem/kyber512/avx2/kem.c +++ b/crypto_kem/kyber512/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER512_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER512_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + 
KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER512_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, 
buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER512_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER512_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER512_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER512_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER512_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER512_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER512_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER512_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER512_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER512_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER512_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* 
Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER512_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512/avx2/kem.h b/crypto_kem/kyber512/avx2/kem.h new file mode 100644 index 00000000..e85d945c --- /dev/null +++ b/crypto_kem/kyber512/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER512_AVX2_KEM_H +#define PQCLEAN_KYBER512_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER512_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER512_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER512_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber1024/avx2/ntt.s b/crypto_kem/kyber512/avx2/ntt.S similarity index 81% rename from crypto_kem/kyber1024/avx2/ntt.s rename to crypto_kem/kyber512/avx2/ntt.S index 6b61bde6..35a933b3 100644 --- a/crypto_kem/kyber1024/avx2/ntt.s +++ b/crypto_kem/kyber512/avx2/ntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmullw %ymm\zl0,%ymm\rh1,%ymm13 @@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3 # We break the dependency chains with the cost of slightly more additions. 
# But they can be run in parallel to the multiplications on execution port 5 # (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x @@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3 vpsubw %ymm15,%ymm\rl3,%ymm\rl3 .endm -.global PQCLEAN_KYBER1024_AVX2_ntt_level0_avx -PQCLEAN_KYBER1024_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 - +.text +ntt_level0_avx: level0: #zetas vpbroadcastd (%rsi),%ymm15 @@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER1024_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 - +ntt_levels1t6_avx: level1: #zetas vpbroadcastd (%rsi),%ymm15 @@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly2 4,5,6,7,8,9,10,11 3 +butterfly2 4,5,6,7,8,9,10,11,3 level2: #zetas @@ -139,7 +133,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly2 3,8,4,9,5,10,6,11 7 +butterfly2 3,8,4,9,5,10,6,11,7 level3: #zetas @@ -151,7 +145,7 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly2 7,5,3,10,8,6,4,11 9 +butterfly2 7,5,3,10,8,6,4,11,9 level4: #zetas @@ -163,7 +157,7 @@ shuffle2 5,6,7,6 shuffle2 3,4,5,4 shuffle2 10,11,3,11 -butterfly2 9,8,7,6,5,4,3,11 10 +butterfly2 9,8,7,6,5,4,3,11,10 level5: #zetas @@ -175,7 +169,7 @@ shuffle1 8,4,9,4 shuffle1 7,3,8,3 shuffle1 6,11,7,11 -butterfly2 10,5,9,4,8,3,7,11 6 +butterfly2 10,5,9,4,8,3,7,11,6 level6: #zetas @@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15 vmovdqu 296(%rsi),%ymm1 vmovdqu 360(%rsi),%ymm2 -butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 -red16 9 14 -red16 4 15 
-red16 8 2 -red16 3 6 -red16 7 12 -red16 11 13 +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 #store vmovdqa %ymm10,(%rdi) @@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi) vmovdqa %ymm11,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER512_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber512/avx2/ntt.h b/crypto_kem/kyber512/avx2/ntt.h index f643cd60..a559287c 100644 --- a/crypto_kem/kyber512/avx2/ntt.h +++ b/crypto_kem/kyber512/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER512_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER512_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER512_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER512_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER512_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER512_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER512_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER512_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER512_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +void PQCLEAN_KYBER512_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +void 
PQCLEAN_KYBER512_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + + +void PQCLEAN_KYBER512_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +void PQCLEAN_KYBER512_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + + +void PQCLEAN_KYBER512_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +void PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512/avx2/params.h b/crypto_kem/kyber512/avx2/params.h index d086d4c6..034b9cdd 100644 --- a/crypto_kem/kyber512/avx2/params.h +++ b/crypto_kem/kyber512/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER512_AVX2_PARAMS_H +#define PQCLEAN_KYBER512_AVX2_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 #define KYBER_POLYCOMPRESSEDBYTES 96 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ 
+/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber512/avx2/poly.c b/crypto_kem/kyber512/avx2/poly.c index 0d166498..7fad1476 100644 --- a/crypto_kem/kyber512/avx2/poly.c +++ b/crypto_kem/kyber512/avx2/poly.c @@ -1,130 +1,240 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER512_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER512_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint16_t)a->coeffs[i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7); + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; } - r[k] = (uint8_t)( t[0] | (t[1] << 3) | (t[2] << 6)); - r[k + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[k + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); - k += 3; + r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + r += 3; } } 
/************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER512_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER512_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 2] = (int16_t)(((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 4] = (int16_t)(((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 5] = (int16_t)(((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 6] = (int16_t)(((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 7] = (int16_t)(((((a[2] >> 5)) * KYBER_Q) + 4) >> 3); +void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; + uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 3); + t[2] = (a[0] >> 6) | (a[1] << 2); + t[3] = (a[1] >> 1); + t[4] = (a[1] >> 4); + t[5] = (a[1] >> 7) | (a[2] << 1); + t[6] = (a[2] >> 2); + t[7] = (a[2] >> 5); a += 3; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER512_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * 
Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER512_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER512_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER512_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER512_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER512_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER512_AVX2_qdata); +} + +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(r->coeffs, a); - PQCLEAN_KYBER512_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + 
const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_load_si256((__m256i *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER512_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + 
__m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } +} + +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - PQCLEAN_KYBER512_AVX2_cbd(r, buf); +void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER512_AVX2_cbd(r, buf.arr); } -// FIXME void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t *seed, + const uint8_t seed[32], uint8_t nonce0, uint8_t nonce1, uint8_t 
nonce2, uint8_t nonce3) { - uint8_t buf[4][SHAKE256_RATE]; - - PQCLEAN_KYBER512_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3); - - PQCLEAN_KYBER512_AVX2_cbd(r0, buf[0]); - PQCLEAN_KYBER512_AVX2_cbd(r1, buf[1]); - PQCLEAN_KYBER512_AVX2_cbd(r2, buf[2]); - PQCLEAN_KYBER512_AVX2_cbd(r3, buf[3]); + ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + __m256i f; + keccakx4_state state; + + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); + + buf.arr[0][32] = nonce0; + buf.arr[1][32] = nonce1; + buf.arr[2][32] = nonce2; + buf.arr[3][32] = nonce3; + + PQCLEAN_KYBER512_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); + PQCLEAN_KYBER512_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + PQCLEAN_KYBER512_AVX2_cbd(r0, buf.arr[0]); + PQCLEAN_KYBER512_AVX2_cbd(r1, buf.arr[1]); + PQCLEAN_KYBER512_AVX2_cbd(r2, buf.arr[2]); + PQCLEAN_KYBER512_AVX2_cbd(r3, buf.arr[3]); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER512_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -133,73 +243,78 @@ void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER512_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER512_AVX2_zetas_exp); - PQCLEAN_KYBER512_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER512_AVX2_zetas_exp); - PQCLEAN_KYBER512_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER512_AVX2_zetas_exp + 4); - PQCLEAN_KYBER512_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER512_AVX2_zetas_exp + 200); + 
PQCLEAN_KYBER512_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER512_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER512_AVX2_zetas_inv_exp); - PQCLEAN_KYBER512_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER512_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER512_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER512_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(poly *r) { + PQCLEAN_KYBER512_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER512_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER512_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER512_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER512_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER512_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER512_AVX2_zetas_exp + 152); - PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs + 64, - a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER512_AVX2_zetas_exp + 184); - PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER512_AVX2_zetas_exp + 348); - PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER512_AVX2_zetas_exp + 380); +/************************************************* 
+* Name: PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER512_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER512_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER512_AVX2_frommont_avx(r->coeffs); - PQCLEAN_KYBER512_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER512_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER512_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER512_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER512_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER512_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER512_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER512_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER512_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER512_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER512_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER512_AVX2_poly_add * * Description: Add two polynomials * @@ -208,18 +323,19 @@ void PQCLEAN_KYBER512_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER512_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -228,127 +344,13 @@ void PQCLEAN_KYBER512_AVX2_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER512_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = 
_mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = 
_mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = _mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i 
*)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber512/avx2/poly.h b/crypto_kem/kyber512/avx2/poly.h index 1278be0a..e5c603f8 100644 --- a/crypto_kem/kyber512/avx2/poly.h +++ b/crypto_kem/kyber512/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER512_AVX2_POLY_H +#define PQCLEAN_KYBER512_AVX2_POLY_H #include "params.h" - #include #include @@ -11,20 +10,28 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER512_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER512_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER512_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER512_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); + -void PQCLEAN_KYBER512_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER512_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); +void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER512_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, @@ -37,15 +44,23 @@ void PQCLEAN_KYBER512_AVX2_poly_getnoise4x(poly *r0, void PQCLEAN_KYBER512_AVX2_poly_ntt(poly *r); 
-void PQCLEAN_KYBER512_AVX2_poly_invntt(poly *r); + +void PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER512_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER512_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER512_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER512_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER512_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER512_AVX2_poly_reduce(poly *r); + void PQCLEAN_KYBER512_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER512_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER512_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber512/avx2/polyvec.c b/crypto_kem/kyber512/avx2/polyvec.c index f6f7cea8..19a85a0e 100644 --- a/crypto_kem/kyber512/avx2/polyvec.c +++ b/crypto_kem/kyber512/avx2/polyvec.c @@ -1,157 +1,188 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER512_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER512_AVX2_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 
0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER512_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER512_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER512_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER512_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER512_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t 
a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER512_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER512_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER512_AVX2_zetas_exp + 152); - PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER512_AVX2_zetas_exp + 184); - PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER512_AVX2_zetas_exp + 348); - PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER512_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER512_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER512_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER512_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector of polynomials +* for details of conditional 
subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER512_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER512_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -160,7 +191,8 @@ void PQCLEAN_KYBER512_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER512_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber512/avx2/polyvec.h b/crypto_kem/kyber512/avx2/polyvec.h index 3aafaf14..ccdf2172 100644 --- a/crypto_kem/kyber512/avx2/polyvec.h +++ b/crypto_kem/kyber512/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER512_AVX2_POLYVEC_H +#define PQCLEAN_KYBER512_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER512_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER512_AVX2_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER512_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], 
polyvec *a); + +void PQCLEAN_KYBER512_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER512_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER512_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER512_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER512_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER512_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER512_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER512_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber512/avx2/reduce.h b/crypto_kem/kyber512/avx2/reduce.h index a363e4b4..55bc800d 100644 --- a/crypto_kem/kyber512/avx2/reduce.h +++ b/crypto_kem/kyber512/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER512_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER512_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER512_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER512_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +int16_t PQCLEAN_KYBER512_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); + +int16_t PQCLEAN_KYBER512_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER512_AVX2_qdata); #endif diff --git a/crypto_kem/kyber512/avx2/rejsample.c b/crypto_kem/kyber512/avx2/rejsample.c index 876d8788..ea4b16bc 100644 --- a/crypto_kem/kyber512/avx2/rejsample.c +++ b/crypto_kem/kyber512/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" #include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 
0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, - { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 
0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 
14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 
0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, 
-1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, 
+ { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, 
+ { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, 
-1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, 
-1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER512_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 672 +unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER512_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER512_AVX2_qdata.as_arr[_16XQ]); + const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER512_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= 
buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = _mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = 
_mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned 
int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | 
((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber512/avx2/rejsample.h b/crypto_kem/kyber512/avx2/rejsample.h index 7bcade5c..a832f82d 100644 --- a/crypto_kem/kyber512/avx2/rejsample.h +++ b/crypto_kem/kyber512/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER512_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER512_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber1024/avx2/shuffle.s b/crypto_kem/kyber512/avx2/shuffle.S similarity index 66% rename from crypto_kem/kyber1024/avx2/shuffle.s rename to crypto_kem/kyber512/avx2/shuffle.S index fd53d83f..e6c8e413 100644 --- a/crypto_kem/kyber1024/avx2/shuffle.s +++ b/crypto_kem/kyber512/avx2/shuffle.S @@ -1,12 +1,9 @@ +#include "cdecl.inc" .include "fq.inc" .include "shuffle.inc" -.global PQCLEAN_KYBER1024_AVX2_nttunpack_avx -PQCLEAN_KYBER1024_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xv(%rip),%ymm1 - +/* +nttpack_avx: #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa 
%ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret */ +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 @@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_ntttobytes_avx -PQCLEAN_KYBER1024_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 +.global cdecl(PQCLEAN_KYBER512_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER512_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret +ntttobytes128_avx: #load vmovdqa (%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 @@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm12 #csubq -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 #bitpack vpsllw $12,%ymm6,%ymm4 @@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx -PQCLEAN_KYBER1024_AVX2_nttfrombytes_avx: +.global cdecl(PQCLEAN_KYBER512_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER512_AVX2_ntttobytes_avx): #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xmask(%rip),%ymm0 +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret +nttfrombytes128_avx: #load vmovdqu (%rsi),%ymm4 vmovdqu 32(%rsi),%ymm5 @@ -204,3 +243,13 @@ vmovdqa %ymm15,192(%rdi) vmovdqa %ymm1,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER512_AVX2_nttfrombytes_avx) +cdecl(PQCLEAN_KYBER512_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git 
a/crypto_kem/kyber512/avx2/shuffle.inc b/crypto_kem/kyber512/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber512/avx2/shuffle.inc +++ b/crypto_kem/kyber512/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber512/avx2/symmetric-fips202.c b/crypto_kem/kyber512/avx2/symmetric-fips202.c deleted file mode 100644 index 49706ab5..00000000 --- a/crypto_kem/kyber512/avx2/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. - -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. Can be called multiple times to keep squeezing, -* i.e., is incremental. 
-* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER512_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER512_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber512/avx2/symmetric-shake.c b/crypto_kem/kyber512/avx2/symmetric-shake.c new file mode 100644 index 00000000..e18e8c5b --- /dev/null +++ b/crypto_kem/kyber512/avx2/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - keccak_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER512_AVX2_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber512/avx2/symmetric.h b/crypto_kem/kyber512/avx2/symmetric.h index 88982be0..4b80e85c 100644 --- a/crypto_kem/kyber512/avx2/symmetric.h +++ b/crypto_kem/kyber512/avx2/symmetric.h @@ -2,28 +2,36 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" #include "fips202x4.h" -typedef shake128ctx keccak_state; +typedef shake128ctx xof_state; -void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(keccak_state 
*s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER512_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER512_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +void PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(shake128ctx *s, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y); + +void PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); + +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER512_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER512_AVX2_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER512_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES SHAKE128_RATE - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber512/avx2/verify.c b/crypto_kem/kyber512/avx2/verify.c index 4763ce46..0f009974 100644 --- a/crypto_kem/kyber512/avx2/verify.c +++ b/crypto_kem/kyber512/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER512_AVX2_verify * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = _mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER512_AVX2_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER512_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER512_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber512/avx2/verify.h b/crypto_kem/kyber512/avx2/verify.h index 4978884c..38c02db6 100644 --- a/crypto_kem/kyber512/avx2/verify.h +++ b/crypto_kem/kyber512/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER512_AVX2_VERIFY_H +#define PQCLEAN_KYBER512_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER512_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER512_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber512/clean/LICENSE b/crypto_kem/kyber512/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber512/clean/LICENSE +++ b/crypto_kem/kyber512/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from 
sources and by authors listed in diff --git a/crypto_kem/kyber512/clean/Makefile b/crypto_kem/kyber512/clean/Makefile index 0b0678ee..1f187f0d 100644 --- a/crypto_kem/kyber512/clean/Makefile +++ b/crypto_kem/kyber512/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber512_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-fips202.o +HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h +OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-shake.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber512/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber512/clean/Makefile.Microsoft_nmake index 85c8d9a5..2f8c6221 100644 --- a/crypto_kem/kyber512/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber512/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber512_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-fips202.obj +OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-shake.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber512/clean/cbd.c b/crypto_kem/kyber512/clean/cbd.c index 9dd0975f..3bed346f 100644 --- a/crypto_kem/kyber512/clean/cbd.c +++ b/crypto_kem/kyber512/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" #include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x **************************************************/ -static uint32_t 
load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER512_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber512/clean/cbd.h b/crypto_kem/kyber512/clean/cbd.h index 2eb5dc89..1a4f3ef4 100644 --- a/crypto_kem/kyber512/clean/cbd.h +++ b/crypto_kem/kyber512/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER512_CLEAN_CBD_H +#define PQCLEAN_KYBER512_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER512_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git 
a/crypto_kem/kyber512/clean/indcpa.c b/crypto_kem/kyber512/clean/indcpa.c index 3f734eb5..edb54c76 100644 --- a/crypto_kem/kyber512/clean/indcpa.c +++ b/crypto_kem/kyber512/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; 
i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void 
pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER512_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER512_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER512_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER512_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) **************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - 
size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER512_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER512_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER512_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER512_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER512_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme 
underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER512_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER512_CLEAN_poly_tomont(&pkpv.vec[i]); } 
PQCLEAN_KYBER512_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER512_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER512_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { 
PQCLEAN_KYBER512_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER512_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t *c, PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER512_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER512_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER512_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER512_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER512_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER512_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER512_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER512_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER512_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER512_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER512_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber512/clean/indcpa.h b/crypto_kem/kyber512/clean/indcpa.h index eaa4fb1e..cb865942 100644 --- a/crypto_kem/kyber512/clean/indcpa.h +++ b/crypto_kem/kyber512/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER512_CLEAN_INDCPA_H +#define PQCLEAN_KYBER512_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER512_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER512_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER512_CLEAN_indcpa_enc( - uint8_t *c, - const uint8_t *m, - 
const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER512_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER512_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER512_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER512_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber512/clean/kem.c b/crypto_kem/kyber512/clean/kem.c index f7490dd7..f7b77456 100644 --- a/crypto_kem/kyber512/clean/kem.c +++ b/crypto_kem/kyber512/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; 
PQCLEAN_KYBER512_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER512_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + hash_h(buf, buf, 
KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER512_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER512_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER512_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER512_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER512_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER512_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER512_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER512_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER512_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash 
concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber512/clean/kem.h b/crypto_kem/kyber512/clean/kem.h new file mode 100644 index 00000000..c3a98378 --- /dev/null +++ b/crypto_kem/kyber512/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER512_CLEAN_KEM_H +#define PQCLEAN_KYBER512_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER512_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER512_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER512_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber512/clean/ntt.c b/crypto_kem/kyber512/clean/ntt.c index 444664b3..99f1257f 100644 --- a/crypto_kem/kyber512/clean/ntt.c +++ b/crypto_kem/kyber512/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER512_CLEAN_zetas and PQCLEAN_KYBER512_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = tmp[tree[i]]; + PQCLEAN_KYBER512_CLEAN_zetas[i] = tmp[tree[i]]; k = 0; 
for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER512_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER512_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; const int16_t PQCLEAN_KYBER512_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 
2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER512_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq 
**************************************************/ -void PQCLEAN_KYBER512_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER512_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER512_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER512_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER512_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER512_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER512_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER512_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER512_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER512_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ void 
PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber512/clean/ntt.h b/crypto_kem/kyber512/clean/ntt.h index 13e976f7..5f22c401 100644 --- a/crypto_kem/kyber512/clean/ntt.h +++ b/crypto_kem/kyber512/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER512_CLEAN_NTT_H +#define PQCLEAN_KYBER512_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER512_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER512_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER512_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER512_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER512_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER512_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER512_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber512/clean/params.h b/crypto_kem/kyber512/clean/params.h index d086d4c6..d189bf85 100644 --- a/crypto_kem/kyber512/clean/params.h +++ b/crypto_kem/kyber512/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER512_CLEAN_PARAMS_H +#define PQCLEAN_KYBER512_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 2 #define KYBER_POLYCOMPRESSEDBYTES 96 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES 
(KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber512/clean/poly.c b/crypto_kem/kyber512/clean/poly.c index b0136c8e..8a319c7e 100644 --- a/crypto_kem/kyber512/clean/poly.c +++ b/crypto_kem/kyber512/clean/poly.c @@ -1,118 +1,175 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER512_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = 
((((uint16_t)a->coeffs[8 * i + j] << 3) + KYBER_Q / 2) / KYBER_Q) & 7; } - r[k] = (uint8_t)( t[0] | (t[1] << 3) | (t[2] << 6)); - r[k + 1] = (uint8_t)((t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7)); - r[k + 2] = (uint8_t)((t[5] >> 1) | (t[6] << 2) | (t[7] << 5)); - k += 3; + r[0] = (t[0] >> 0) | (t[1] << 3) | (t[2] << 6); + r[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7); + r[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5); + r += 3; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER512_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER512_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)( (((a[0] & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 1] = (int16_t)(((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 2] = (int16_t)(((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 3] = (int16_t)(((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 4] = (int16_t)(((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 5] = (int16_t)(((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 6] = (int16_t)(((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3); - r->coeffs[i + 7] = (int16_t)(((((a[2] >> 5)) * KYBER_Q) + 4) >> 3); +void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + unsigned int j = 0; 
+ uint8_t t[8]; + for (i = 0; i < KYBER_N / 8; i++) { + t[0] = (a[0] >> 0); + t[1] = (a[0] >> 3); + t[2] = (a[0] >> 6) | (a[1] << 2); + t[3] = (a[1] >> 1); + t[4] = (a[1] >> 4); + t[5] = (a[1] >> 7) | (a[2] << 1); + t[6] = (a[2] >> 2); + t[7] = (a[2] >> 5); a += 3; + + for (j = 0; j < 8; j++) { + r->coeffs[8 * i + j] = ((uint16_t)(t[j] & 7) * KYBER_Q + 4) >> 3; + } } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER512_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER512_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER512_CLEAN_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES 
bytes) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER512_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER512_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + PQCLEAN_KYBER512_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } + } 
+} + +/************************************************* +* Name: PQCLEAN_KYBER512_CLEAN_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER512_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER512_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -126,20 +183,20 @@ void PQCLEAN_KYBER512_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER512_CLEAN_invntt(r->coeffs); } 
/************************************************* -* Name: poly_basemul +* Name: PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -147,68 +204,64 @@ void PQCLEAN_KYBER512_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER512_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER512_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER512_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER512_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER512_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER512_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER512_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER512_CLEAN_montgomery_reduce( - 
(int32_t)r->coeffs[i] * f); + for (i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER512_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER512_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER512_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER512_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER512_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER512_CLEAN_poly_add * * Description: Add two polynomials * @@ -217,13 +270,14 @@ void PQCLEAN_KYBER512_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER512_CLEAN_poly_sub * * Description: Subtract two polynomials * @@ -232,48 +286,8 @@ void PQCLEAN_KYBER512_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - uint16_t mask; - - for 
(size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER512_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber512/clean/poly.h b/crypto_kem/kyber512/clean/poly.h index ecdc7c29..1cd9d235 100644 --- a/crypto_kem/kyber512/clean/poly.h +++ b/crypto_kem/kyber512/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER512_CLEAN_POLY_H +#define PQCLEAN_KYBER512_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER512_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER512_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER512_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER512_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER512_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER512_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER512_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER512_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER512_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER512_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER512_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER512_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER512_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER512_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER512_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER512_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git 
a/crypto_kem/kyber512/clean/polyvec.c b/crypto_kem/kyber512/clean/polyvec.c index ab4a352a..6033d526 100644 --- a/crypto_kem/kyber512/clean/polyvec.c +++ b/crypto_kem/kyber512/clean/polyvec.c @@ -1,128 +1,153 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER512_CLEAN_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 
5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER512_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: 
PQCLEAN_KYBER512_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER512_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to 
in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER512_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER512_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER512_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER512_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER512_CLEAN_poly_add(r, r, &t); } @@ -130,37 +155,40 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, con } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in reduce.c +* for details of 
conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER512_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER512_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -169,7 +197,8 @@ void PQCLEAN_KYBER512_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER512_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER512_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber512/clean/polyvec.h b/crypto_kem/kyber512/clean/polyvec.h index 159d1bd2..33b27cb8 100644 --- a/crypto_kem/kyber512/clean/polyvec.h +++ b/crypto_kem/kyber512/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER512_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER512_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER512_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER512_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER512_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER512_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER512_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER512_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER512_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER512_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER512_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER512_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER512_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber512/clean/reduce.c b/crypto_kem/kyber512/clean/reduce.c index 60415dee..6e3b086d 100644 --- a/crypto_kem/kyber512/clean/reduce.c +++ b/crypto_kem/kyber512/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: montgomery_reduce +* Name: PQCLEAN_KYBER512_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER512_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER512_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER512_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER512_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER512_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t x: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber512/clean/reduce.h b/crypto_kem/kyber512/clean/reduce.h index 68a7f570..c7db6ba5 100644 --- a/crypto_kem/kyber512/clean/reduce.h +++ b/crypto_kem/kyber512/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER512_CLEAN_REDUCE_H +#define PQCLEAN_KYBER512_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER512_CLEAN_montgomery_reduce(int32_t a); + int16_t PQCLEAN_KYBER512_CLEAN_barrett_reduce(int16_t a); + int16_t 
PQCLEAN_KYBER512_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber512/clean/symmetric-fips202.c b/crypto_kem/kyber512/clean/symmetric-fips202.c deleted file mode 100644 index fdc11f25..00000000 --- a/crypto_kem/kyber512/clean/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. - -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. Can be called multiple times to keep squeezing, -* i.e., is incremental. 
-* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER512_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber512/clean/symmetric-shake.c b/crypto_kem/kyber512/clean/symmetric-shake.c new file mode 100644 index 00000000..9e6f203f --- /dev/null +++ b/crypto_kem/kyber512/clean/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - keccak_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber512/clean/symmetric.h b/crypto_kem/kyber512/clean/symmetric.h index 5ef4f0f7..1d787d82 100644 --- a/crypto_kem/kyber512/clean/symmetric.h +++ b/crypto_kem/kyber512/clean/symmetric.h @@ -2,29 +2,35 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" -#include +typedef shake128ctx xof_state; + +void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(xof_state *s, + const uint8_t seed[KYBER_SYMBYTES], 
+ uint8_t x, + uint8_t y); -typedef shake128ctx keccak_state; +void PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); -void PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER512_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER512_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER512_CLEAN_kyber_shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER512_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER512_CLEAN_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER512_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES 168 - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber512/clean/verify.c b/crypto_kem/kyber512/clean/verify.c index 149e52d7..619f1a2b 100644 --- a/crypto_kem/kyber512/clean/verify.c +++ b/crypto_kem/kyber512/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER512_CLEAN_verify * * Description: Compare two arrays for equality in constant time. 
* * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER512_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER512_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER512_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER512_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER512_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber512/clean/verify.h b/crypto_kem/kyber512/clean/verify.h index d95be219..0664f357 100644 --- a/crypto_kem/kyber512/clean/verify.h +++ b/crypto_kem/kyber512/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER512_CLEAN_VERIFY_H +#define PQCLEAN_KYBER512_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER512_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER512_CLEAN_verify(const uint8_t *a, 
const uint8_t *b, size_t len); + void PQCLEAN_KYBER512_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber768-90s/META.yml b/crypto_kem/kyber768-90s/META.yml index f9c895d1..eda3e3f1 100644 --- a/crypto_kem/kyber768-90s/META.yml +++ b/crypto_kem/kyber768-90s/META.yml @@ -20,16 +20,17 @@ auxiliary-submitters: - Gregor Seiler - Damien Stehlé implementations: - - name: clean - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - required_flags: - - aes - - avx2 - - bmi2 - - popcnt + - name: clean + version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + - name: avx2 + version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - aes + - avx2 + - bmi2 + - popcnt diff --git a/crypto_kem/kyber768-90s/avx2/LICENSE b/crypto_kem/kyber768-90s/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber768-90s/avx2/LICENSE +++ b/crypto_kem/kyber768-90s/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber768-90s/avx2/Makefile b/crypto_kem/kyber768-90s/avx2/Makefile index e7dcec06..4689bf26 100644 --- a/crypto_kem/kyber768-90s/avx2/Makefile +++ b/crypto_kem/kyber768-90s/avx2/Makefile @@ -1,9 +1,40 @@ # This Makefile can be used with GNU Make or BSD Make 
LIB=libkyber768-90s_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h aes256ctr.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o aes256ctr.o +HEADERS= \ + aes256ctr.h \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + aes256ctr.o \ + basemul.o \ + cbd.o \ + consts.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + verify.o CFLAGS=-mavx2 -maes -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ -Wmissing-prototypes -Wredundant-decls -std=c99 \ @@ -14,11 +45,8 @@ all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(AR) -r $@ $(OBJECTS) diff --git a/crypto_kem/kyber768-90s/avx2/aes256ctr.c b/crypto_kem/kyber768-90s/avx2/aes256ctr.c index 7a2d3a15..4b5522a2 100644 --- a/crypto_kem/kyber768-90s/avx2/aes256ctr.c +++ b/crypto_kem/kyber768-90s/avx2/aes256ctr.c @@ -1,94 +1,68 @@ /* - crypto_stream_aes256ctr - based heavily on public-domain code by Romain Dolbeau + Based heavily on public-domain code by Romain Dolbeau Different handling of nonce+counter than original version - using separated 96-bit nonce and internal 32-bit counter, starting from zero + using separated 64-bit nonce and internal 64-bit counter, starting from zero Public Domain */ #include "aes256ctr.h" - #include +#include #include -static inline void aesni_encrypt8(uint8_t *out, +static inline void aesni_encrypt4(uint8_t out[64], __m128i *n, const __m128i rkeys[16]) { - __m128i nv0; - __m128i nv1; - __m128i nv2; - __m128i nv3; - __m128i nv4; - __m128i nv5; - __m128i nv6; 
- __m128i nv7; + __m128i f, f0, f1, f2, f3, t; /* Load current counter value */ - __m128i nv0i = _mm_load_si128(n); - - /* Increase counter in 8 consecutive blocks */ - nv0 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(0, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv1 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(1, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv2 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(2, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv3 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(3, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv4 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(4, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv5 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(5, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv6 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(6, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - nv7 = _mm_shuffle_epi8(_mm_add_epi32(nv0i, _mm_set_epi64x(7, 0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); - - /* Write counter for next iteration, increased by 8 */ - _mm_store_si128(n, _mm_add_epi32(nv0i, _mm_set_epi64x(8, 0))); - - /* Actual AES encryption, 8x interleaved */ - __m128i temp0 = _mm_xor_si128(nv0, rkeys[0]); - __m128i temp1 = _mm_xor_si128(nv1, rkeys[0]); - __m128i temp2 = _mm_xor_si128(nv2, rkeys[0]); - __m128i temp3 = _mm_xor_si128(nv3, rkeys[0]); - __m128i temp4 = _mm_xor_si128(nv4, rkeys[0]); - __m128i temp5 = _mm_xor_si128(nv5, rkeys[0]); - __m128i temp6 = _mm_xor_si128(nv6, rkeys[0]); - __m128i temp7 = _mm_xor_si128(nv7, rkeys[0]); - - for (uint8_t i = 1; i < 14; i++) { - temp0 = _mm_aesenc_si128(temp0, rkeys[i]); - temp1 = _mm_aesenc_si128(temp1, rkeys[i]); - temp2 = _mm_aesenc_si128(temp2, rkeys[i]); - 
temp3 = _mm_aesenc_si128(temp3, rkeys[i]); - temp4 = _mm_aesenc_si128(temp4, rkeys[i]); - temp5 = _mm_aesenc_si128(temp5, rkeys[i]); - temp6 = _mm_aesenc_si128(temp6, rkeys[i]); - temp7 = _mm_aesenc_si128(temp7, rkeys[i]); + f = _mm_load_si128(n); + + /* Increase counter in 4 consecutive blocks */ + t = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + f0 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(0, 0)), t); + f1 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(1, 0)), t); + f2 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(2, 0)), t); + f3 = _mm_shuffle_epi8(_mm_add_epi64(f, _mm_set_epi64x(3, 0)), t); + + /* Write counter for next iteration, increased by 4 */ + _mm_store_si128(n, _mm_add_epi64(f, _mm_set_epi64x(4, 0))); + + /* Actual AES encryption, 4x interleaved */ + t = _mm_load_si128(&rkeys[0]); + f0 = _mm_xor_si128(f0, t); + f1 = _mm_xor_si128(f1, t); + f2 = _mm_xor_si128(f2, t); + f3 = _mm_xor_si128(f3, t); + + for (int i = 1; i < 14; i++) { + t = _mm_load_si128(&rkeys[i]); + f0 = _mm_aesenc_si128(f0, t); + f1 = _mm_aesenc_si128(f1, t); + f2 = _mm_aesenc_si128(f2, t); + f3 = _mm_aesenc_si128(f3, t); } - temp0 = _mm_aesenclast_si128(temp0, rkeys[14]); - temp1 = _mm_aesenclast_si128(temp1, rkeys[14]); - temp2 = _mm_aesenclast_si128(temp2, rkeys[14]); - temp3 = _mm_aesenclast_si128(temp3, rkeys[14]); - temp4 = _mm_aesenclast_si128(temp4, rkeys[14]); - temp5 = _mm_aesenclast_si128(temp5, rkeys[14]); - temp6 = _mm_aesenclast_si128(temp6, rkeys[14]); - temp7 = _mm_aesenclast_si128(temp7, rkeys[14]); + t = _mm_load_si128(&rkeys[14]); + f0 = _mm_aesenclast_si128(f0, t); + f1 = _mm_aesenclast_si128(f1, t); + f2 = _mm_aesenclast_si128(f2, t); + f3 = _mm_aesenclast_si128(f3, t); /* Write results */ - _mm_storeu_si128((__m128i *)(out + 0), temp0); - _mm_storeu_si128((__m128i *)(out + 16), temp1); - _mm_storeu_si128((__m128i *)(out + 32), temp2); - _mm_storeu_si128((__m128i *)(out + 48), temp3); - _mm_storeu_si128((__m128i *)(out + 64), 
temp4); - _mm_storeu_si128((__m128i *)(out + 80), temp5); - _mm_storeu_si128((__m128i *)(out + 96), temp6); - _mm_storeu_si128((__m128i *)(out + 112), temp7); + _mm_storeu_si128((__m128i *)(out + 0), f0); + _mm_storeu_si128((__m128i *)(out + 16), f1); + _mm_storeu_si128((__m128i *)(out + 32), f2); + _mm_storeu_si128((__m128i *)(out + 48), f3); } -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce) { - __m128i key0 = _mm_loadu_si128((__m128i *)(key + 0)); - __m128i key1 = _mm_loadu_si128((__m128i *)(key + 16)); - __m128i temp0, temp1, temp2, temp4; - size_t idx = 0; +void PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce) { + __m128i key0, key1, temp0, temp1, temp2, temp4; + int idx = 0; - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); + key0 = _mm_loadu_si128((__m128i *)(key + 0)); + key1 = _mm_loadu_si128((__m128i *)(key + 16)); + state->n = _mm_loadl_epi64((__m128i *)&nonce); state->rkeys[idx++] = key0; temp0 = key0; @@ -137,38 +111,33 @@ void PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(aes256ctr_ctx *state, state->rkeys[idx++] = temp0; } -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce) { - state->n = _mm_set_epi64x(0, (uint64_t)nonce << 48); -} - void PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state) { - size_t i; - + size_t i = 0; for (i = 0; i < nblocks; i++) { - aesni_encrypt8(out, &state->n, state->rkeys); - out += 128; + aesni_encrypt4(out, &state->n, state->rkeys); + out += 64; } } void PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, - const uint8_t *seed, - uint8_t nonce) { - size_t i; - uint8_t buf[128]; + const uint8_t seed[32], + uint64_t nonce) { + unsigned int i = 0; + uint8_t buf[64]; aes256ctr_ctx state; - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, seed, (uint16_t)nonce << 8); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, seed, 
nonce); - while (outlen >= 128) { - aesni_encrypt8(out, &state.n, state.rkeys); - outlen -= 128; + while (outlen >= 64) { + aesni_encrypt4(out, &state.n, state.rkeys); + outlen -= 64; } if (outlen) { - aesni_encrypt8(buf, &state.n, state.rkeys); + aesni_encrypt4(buf, &state.n, state.rkeys); for (i = 0; i < outlen; i++) { out[i] = buf[i]; } diff --git a/crypto_kem/kyber768-90s/avx2/aes256ctr.h b/crypto_kem/kyber768-90s/avx2/aes256ctr.h index 7e0e54d3..5725b5a6 100644 --- a/crypto_kem/kyber768-90s/avx2/aes256ctr.h +++ b/crypto_kem/kyber768-90s/avx2/aes256ctr.h @@ -5,22 +5,17 @@ #include #include +#define AES256CTR_NAMESPACE(s) pqcrystals_aes256ctr_avx2##s + +#define AES256CTR_BLOCKBYTES 64 + typedef struct { __m128i rkeys[16]; __m128i n; } aes256ctr_ctx; -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(aes256ctr_ctx *state, - const uint8_t *key, - uint16_t nonce); -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(aes256ctr_ctx *state, uint16_t nonce); -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, - size_t nblocks, - aes256ctr_ctx *state); - -void PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(uint8_t *out, - size_t outlen, - const uint8_t *seed, - uint8_t nonce); +void PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(aes256ctr_ctx *state, const uint8_t key[32], uint64_t nonce); +void PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(uint8_t *out, size_t nblocks, aes256ctr_ctx *state); +void PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t seed[32], uint64_t nonce); #endif diff --git a/crypto_kem/kyber768-90s/avx2/align.h b/crypto_kem/kyber768-90s/avx2/align.h new file mode 100644 index 00000000..898b1183 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER76890S_AVX2_ALIGN_H +#define PQCLEAN_KYBER76890S_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define 
ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber768-90s/avx2/basemul.S b/crypto_kem/kyber768-90s/avx2/basemul.S index 7341b0b6..cf11d6c5 100644 --- a/crypto_kem/kyber768-90s/avx2/basemul.S +++ b/crypto_kem/kyber768-90s/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx -PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 - +.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -109,7 +105,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -144,7 +140,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -152,17 +148,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER76890S_AVX2_basemul_avx -PQCLEAN_KYBER76890S_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx): #consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add 
$128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -171,10 +190,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git a/crypto_kem/kyber768-90s/avx2/cbd.c b/crypto_kem/kyber768-90s/avx2/cbd.c index 7c6cd243..65716707 100644 --- a/crypto_kem/kyber768-90s/avx2/cbd.c +++ b/crypto_kem/kyber768-90s/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER76890S_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *buf: pointer to input byte array +* - const unsigned char *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER76890S_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i 
mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber768-90s/avx2/cbd.h b/crypto_kem/kyber768-90s/avx2/cbd.h index 14065af5..2a88f4cc 100644 --- a/crypto_kem/kyber768-90s/avx2/cbd.h +++ b/crypto_kem/kyber768-90s/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER76890S_AVX2_CBD_H +#define PQCLEAN_KYBER76890S_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER76890S_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER76890S_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber768-90s/avx2/cdecl.inc b/crypto_kem/kyber768-90s/avx2/cdecl.inc new file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). 
+ * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber768-90s/avx2/consts.c b/crypto_kem/kyber768-90s/avx2/consts.c index 891c1efd..9e28d64d 100644 --- a/crypto_kem/kyber768-90s/avx2/consts.c +++ b/crypto_kem/kyber768-90s/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER76890S_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 
10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER76890S_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 
1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) -#define FLO (FHI * QINV % 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI 
(MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER76890S_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, 
FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 
1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 
56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git 
a/crypto_kem/kyber768-90s/avx2/consts.h b/crypto_kem/kyber768-90s/avx2/consts.h index 6c15e815..739cd184 100644 --- a/crypto_kem/kyber768-90s/avx2/consts.h +++ b/crypto_kem/kyber768-90s/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER76890S_AVX2_CONSTS_H +#define PQCLEAN_KYBER76890S_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER76890S_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER76890S_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmontsqlo; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER76890S_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER76890S_AVX2_qdata; #endif diff --git a/crypto_kem/kyber768-90s/avx2/fq.S b/crypto_kem/kyber768-90s/avx2/fq.S new file mode 100644 index 00000000..ec603823 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/fq.S @@ -0,0 +1,129 @@ +#include "cdecl.inc" +.include "fq.inc" + +.text +reduce128_avx: +#load +vmovdqa (%rdi),%ymm2 +vmovdqa 32(%rdi),%ymm3 +vmovdqa 64(%rdi),%ymm4 +vmovdqa 96(%rdi),%ymm5 +vmovdqa 128(%rdi),%ymm6 +vmovdqa 160(%rdi),%ymm7 +vmovdqa 192(%rdi),%ymm8 +vmovdqa 224(%rdi),%ymm9 + +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 + +#store +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm3,32(%rdi) +vmovdqa %ymm4,64(%rdi) +vmovdqa %ymm5,96(%rdi) 
+vmovdqa %ymm6,128(%rdi) +vmovdqa %ymm7,160(%rdi) +vmovdqa %ymm8,192(%rdi) +vmovdqa %ymm9,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_reduce_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret + +csubq128_avx: +#load +vmovdqa (%rdi),%ymm1 +vmovdqa 32(%rdi),%ymm2 +vmovdqa 64(%rdi),%ymm3 +vmovdqa 96(%rdi),%ymm4 +vmovdqa 128(%rdi),%ymm5 +vmovdqa 160(%rdi),%ymm6 +vmovdqa 192(%rdi),%ymm7 +vmovdqa 224(%rdi),%ymm8 + +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 + +#store +vmovdqa %ymm1,(%rdi) +vmovdqa %ymm2,32(%rdi) +vmovdqa %ymm3,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm5,128(%rdi) +vmovdqa %ymm6,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm8,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_csubq_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret + +tomont128_avx: +#load +vmovdqa (%rdi),%ymm3 +vmovdqa 32(%rdi),%ymm4 +vmovdqa 64(%rdi),%ymm5 +vmovdqa 96(%rdi),%ymm6 +vmovdqa 128(%rdi),%ymm7 +vmovdqa 160(%rdi),%ymm8 +vmovdqa 192(%rdi),%ymm9 +vmovdqa 224(%rdi),%ymm10 + +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 + +#store +vmovdqa %ymm3,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm5,64(%rdi) +vmovdqa %ymm6,96(%rdi) +vmovdqa %ymm7,128(%rdi) +vmovdqa %ymm8,160(%rdi) +vmovdqa %ymm9,192(%rdi) +vmovdqa %ymm10,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber768-90s/avx2/fq.inc 
b/crypto_kem/kyber768-90s/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber768-90s/avx2/fq.inc +++ b/crypto_kem/kyber768-90s/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro fqmulprecomp al,ah,b x=12 +.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber768-90s/avx2/fq.s b/crypto_kem/kyber768-90s/avx2/fq.s deleted file mode 100644 index e82d0cee..00000000 --- a/crypto_kem/kyber768-90s/avx2/fq.s +++ /dev/null @@ -1,112 +0,0 @@ -.include "fq.inc" - -.global PQCLEAN_KYBER76890S_AVX2_reduce_avx -PQCLEAN_KYBER76890S_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xv(%rip),%ymm1 - -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_csubq_avx -PQCLEAN_KYBER76890S_AVX2_csubq_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 
64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_frommont_avx -PQCLEAN_KYBER76890S_AVX2_frommont_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xmontsqhi(%rip),%ymm2 - -#load -vmovdqa (%rdi),%ymm3 -vmovdqa 32(%rdi),%ymm4 -vmovdqa 64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm6 -vmovdqa 128(%rdi),%ymm7 -vmovdqa 160(%rdi),%ymm8 -vmovdqa 192(%rdi),%ymm9 -vmovdqa 224(%rdi),%ymm10 - -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 -fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm4,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm6,96(%rdi) -vmovdqa %ymm7,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa %ymm9,192(%rdi) -vmovdqa %ymm10,224(%rdi) - -ret diff --git a/crypto_kem/kyber768-90s/avx2/indcpa.c b/crypto_kem/kyber768-90s/avx2/indcpa.c index 09e4df24..90dca8ac 100644 --- a/crypto_kem/kyber768-90s/avx2/indcpa.c +++ b/crypto_kem/kyber768-90s/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include "ntt.h" +#include "params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include /************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and 
the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret 
key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER76890S_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER76890S_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void 
pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER76890S_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,46 +150,47 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const 
uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER76890S_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER76890S_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER76890S_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - union { - uint8_t x[XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; + ALIGN32_ARRAY(uint8_t, GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES) buf; aes256ctr_ctx state; PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, seed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_K; j++) { + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_K; j++) { if (transposed) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (i << 8) + j); + nonce.orig = (j << 8) | i; } else { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (j << 8) + i); + 
nonce.orig = (i << 8) | j; } - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.x, GEN_MATRIX_MAXNBLOCKS, &state); - ctr = PQCLEAN_KYBER76890S_AVX2_rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf.x, GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES); + state.n = _mm_loadl_epi64(&nonce.vec); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, GEN_MATRIX_NBLOCKS, &state); + ctr = PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(a[i].vec[j].coeffs, buf.arr); while (ctr < KYBER_N) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.x, 1, &state); - ctr += rej_uniform_ref(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.x, XOF_BLOCKBYTES); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 1, &state); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf.arr, + XOF_BLOCKBYTES); } PQCLEAN_KYBER76890S_AVX2_poly_nttunpack(&a[i].vec[j]); @@ -164,47 +199,53 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER76890S_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t 
sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t coins[128]; - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, noiseseed, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER76890S_AVX2_cbd(skpv.vec + i, coins); + ALIGN32_ARRAY(uint8_t, 128) coins; + PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, noiseseed, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER76890S_AVX2_cbd(&skpv.vec[i], coins.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins, 1, &state); - PQCLEAN_KYBER76890S_AVX2_cbd(e.vec + i, coins); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(coins.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER76890S_AVX2_cbd(&e.vec[i], coins.arr); } PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - PQCLEAN_KYBER76890S_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + 
PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER76890S_AVX2_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER76890S_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -215,58 +256,67 @@ void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER76890S_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, 
at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER76890S_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, seed.arr); + ALIGN16_TYPE(uint64_t) nonce = {.orig = 0}; aes256ctr_ctx state; - uint8_t buf[128]; - PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, coins, 0); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER76890S_AVX2_cbd(sp.vec + i, buf); + ALIGN32_ARRAY(uint8_t, 128) buf; + PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(&state, coins, nonce.orig++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER76890S_AVX2_cbd(&sp.vec[i], buf.arr); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER76890S_AVX2_cbd(ep.vec + i, buf); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER76890S_AVX2_cbd(&ep.vec[i], buf.arr); } - PQCLEAN_KYBER76890S_AVX2_aes256ctr_select(&state, (uint16_t)nonce++ << 8); - PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf, 1, &state); - PQCLEAN_KYBER76890S_AVX2_cbd(&epp, buf); + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(buf.arr, 2, &state); + state.n = _mm_loadl_epi64(&nonce.vec); + nonce.orig++; + PQCLEAN_KYBER76890S_AVX2_cbd(&epp, buf.arr); PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } 
- PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER76890S_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER76890S_AVX2_poly_invntt(&v); + PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER76890S_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER76890S_AVX2_poly_add(&v, &v, &epp); @@ -278,18 +328,21 @@ void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER76890S_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER76890S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -297,8 +350,8 @@ void PQCLEAN_KYBER76890S_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER76890S_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + 
PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER76890S_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER76890S_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber768-90s/avx2/indcpa.h b/crypto_kem/kyber768-90s/avx2/indcpa.h index 54ab2365..27686743 100644 --- a/crypto_kem/kyber768-90s/avx2/indcpa.h +++ b/crypto_kem/kyber768-90s/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER76890S_AVX2_INDCPA_H +#define PQCLEAN_KYBER76890S_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER76890S_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER76890S_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER76890S_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER76890S_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER76890S_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber768-90s/avx2/invntt.S b/crypto_kem/kyber768-90s/avx2/invntt.S new file mode 100644 index 00000000..b7c786e6 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/invntt.S @@ -0,0 +1,225 @@ +#include "cdecl.inc" +.include "shuffle.inc" +.include "fq.inc" + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 +#update & mul +vpsubw %ymm\rh0,%ymm\rl0,%ymm12 +vpsubw %ymm\rh1,%ymm\rl1,%ymm13 +vpsubw %ymm\rh2,%ymm\rl2,%ymm14 + +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpmullw 
%ymm\zl0,%ymm12,%ymm\rh0 + +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpmullw %ymm\zl0,%ymm13,%ymm\rh1 +vpsubw %ymm\rh3,%ymm\rl3,%ymm15 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 +vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpmullw %ymm\zl1,%ymm15,%ymm\rh3 + +vpmulhw %ymm\zh0,%ymm12,%ymm12 +vpmulhw %ymm\zh0,%ymm13,%ymm13 + +vpmulhw %ymm\zh1,%ymm14,%ymm14 +vpmulhw %ymm\zh1,%ymm15,%ymm15 + +#reduce +vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 +vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 +vpsubw %ymm\rh0,%ymm12,%ymm\rh0 +vpsubw %ymm\rh1,%ymm13,%ymm\rh1 +vpsubw %ymm\rh2,%ymm14,%ymm\rh2 +vpsubw %ymm\rh3,%ymm15,%ymm\rh3 +.endm + +.text +invntt_levels0t5_avx: +level0: +#zetas +vmovdqu (%rsi),%ymm15 +vmovdqu 64(%rsi),%ymm3 +vmovdqu 32(%rsi),%ymm1 +vmovdqu 96(%rsi),%ymm2 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 + +level1: +#zetas +vmovdqu 128(%rsi),%ymm3 +vmovdqu 160(%rsi),%ymm2 + +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +level2: +#zetas +vmovdqu 192(%rsi),%ymm10 +vmovdqu 224(%rsi),%ymm2 + +#consts +vmovdqa _16XV*2(%rdx),%ymm1 + +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 + +red16 3 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +level3: +#zetas +vmovdqu 256(%rsi),%ymm9 +vmovdqu 288(%rsi),%ymm2 + +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 + +red16 10 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +level4: +#zetas +vmovdqu 320(%rsi),%ymm7 +vmovdqu 352(%rsi),%ymm2 + +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 + +red16 9 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +level5: +#zetas +vpbroadcastd 384(%rsi),%ymm8 +vpbroadcastd 388(%rsi),%ymm2 + +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 + +red16 7 + +#store 
+vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +invntt_level6_avx: +#zetas +vpbroadcastd (%rsi),%ymm1 +vpbroadcastd 4(%rsi),%ymm2 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 256(%rdi),%ymm8 +vmovdqa 288(%rdi),%ymm9 +vmovdqa 320(%rdi),%ymm10 +vmovdqa 352(%rdi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +#consts +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 + +#store +vmovdqa %ymm8,256(%rdi) +vmovdqa %ymm9,288(%rdi) +vmovdqa %ymm10,320(%rdi) +vmovdqa %ymm11,352(%rdi) + +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm7,96(%rdi) + +#load +vmovdqa 128(%rdi),%ymm4 +vmovdqa 160(%rdi),%ymm5 +vmovdqa 192(%rdi),%ymm6 +vmovdqa 224(%rdi),%ymm7 +vmovdqa 384(%rdi),%ymm8 +vmovdqa 416(%rdi),%ymm9 +vmovdqa 448(%rdi),%ymm10 +vmovdqa 480(%rdi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +#consts +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 + +#store +vmovdqa %ymm8,384(%rdi) +vmovdqa %ymm9,416(%rdi) +vmovdqa %ymm10,448(%rdi) +vmovdqa %ymm11,480(%rdi) + +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 + +#store +vmovdqa %ymm4,128(%rdi) +vmovdqa %ymm5,160(%rdi) +vmovdqa %ymm6,192(%rdi) +vmovdqa %ymm7,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber768-90s/avx2/invntt.s b/crypto_kem/kyber768-90s/avx2/invntt.s deleted file mode 100644 
index 9e74001c..00000000 --- a/crypto_kem/kyber768-90s/avx2/invntt.s +++ /dev/null @@ -1,217 +0,0 @@ -.include "shuffle.inc" -.include "fq.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=1,zl1=1,zh0=2,zh1=2 -#update & mul -vpsubw %ymm\rh0,%ymm\rl0,%ymm12 -vpsubw %ymm\rh1,%ymm\rl1,%ymm13 -vpsubw %ymm\rh2,%ymm\rl2,%ymm14 - -vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 -vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 -vpmullw %ymm\zl0,%ymm12,%ymm\rh0 - -vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 -vpmullw %ymm\zl0,%ymm13,%ymm\rh1 -vpsubw %ymm\rh3,%ymm\rl3,%ymm15 - -vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 -vpmullw %ymm\zl1,%ymm14,%ymm\rh2 -vpmullw %ymm\zl1,%ymm15,%ymm\rh3 - -vpmulhw %ymm\zh0,%ymm12,%ymm12 -vpmulhw %ymm\zh0,%ymm13,%ymm13 - -vpmulhw %ymm\zh1,%ymm14,%ymm14 -vpmulhw %ymm\zh1,%ymm15,%ymm15 - -#reduce -vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 -vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 -vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 -vpsubw %ymm\rh0,%ymm12,%ymm\rh0 -vpsubw %ymm\rh1,%ymm13,%ymm\rh1 -vpsubw %ymm\rh2,%ymm14,%ymm\rh2 -vpsubw %ymm\rh3,%ymm15,%ymm\rh3 -.endm - -.global PQCLEAN_KYBER76890S_AVX2_invntt_levels0t5_avx -.p2align 5 -PQCLEAN_KYBER76890S_AVX2_invntt_levels0t5_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -level0: -#zetas -vmovdqu (%rsi),%ymm15 -vmovdqu 64(%rsi),%ymm3 -vmovdqu 32(%rsi),%ymm1 -vmovdqu 96(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly 4,5,8,9,6,7,10,11 15,3,1,2 - -level1: -#zetas -vmovdqu 128(%rsi),%ymm3 -vmovdqu 160(%rsi),%ymm2 - -butterfly 4,5,6,7,8,9,10,11 3,3,2,2 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 -shuffle1 10,11,8,11 - -level2: -#zetas -vmovdqu 192(%rsi),%ymm10 -vmovdqu 224(%rsi),%ymm2 - -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xv(%rip),%ymm1 - -butterfly 3,4,6,8,5,7,9,11 10,10,2,2 - -red16 3 - -shuffle2 3,4,10,4 -shuffle2 
6,8,3,8 -shuffle2 5,7,6,7 -shuffle2 9,11,5,11 - -level3: -#zetas -vmovdqu 256(%rsi),%ymm9 -vmovdqu 288(%rsi),%ymm2 - -butterfly 10,3,6,5,4,8,7,11 9,9,2,2 - -red16 10 - -shuffle4 10,3,9,3 -shuffle4 6,5,10,5 -shuffle4 4,8,6,8 -shuffle4 7,11,4,11 - -level4: -#zetas -vmovdqu 320(%rsi),%ymm7 -vmovdqu 352(%rsi),%ymm2 - -butterfly 9,10,6,4,3,5,8,11 7,7,2,2 - -red16 9 - -shuffle8 9,10,7,10 -shuffle8 6,4,9,4 -shuffle8 3,5,6,5 -shuffle8 8,11,3,11 - -level5: -#zetas -vpbroadcastd 384(%rsi),%ymm8 -vpbroadcastd 388(%rsi),%ymm2 - -butterfly 7,9,6,3,10,4,5,11 8,8,2,2 - -red16 7 - -#store -vmovdqa %ymm7,(%rdi) -vmovdqa %ymm9,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm3,96(%rdi) -vmovdqa %ymm10,128(%rdi) -vmovdqa %ymm4,160(%rdi) -vmovdqa %ymm5,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_invntt_level6_avx -PQCLEAN_KYBER76890S_AVX2_invntt_level6_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -#zetas -vpbroadcastd (%rsi),%ymm1 -vpbroadcastd 4(%rsi),%ymm2 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xflo(%rip),%ymm12 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xfhi(%rip),%ymm13 - -#store -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) - -#load -vmovdqa 128(%rdi),%ymm4 -vmovdqa 160(%rdi),%ymm5 -vmovdqa 192(%rdi),%ymm6 -vmovdqa 224(%rdi),%ymm7 -vmovdqa 384(%rdi),%ymm8 -vmovdqa 416(%rdi),%ymm9 -vmovdqa 448(%rdi),%ymm10 -vmovdqa 480(%rdi),%ymm11 - -butterfly 4,5,6,7,8,9,10,11 - -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xflo(%rip),%ymm12 -vmovdqa 
PQCLEAN_KYBER76890S_AVX2_16xfhi(%rip),%ymm13 - -#store -vmovdqa %ymm8,384(%rdi) -vmovdqa %ymm9,416(%rdi) -vmovdqa %ymm10,448(%rdi) -vmovdqa %ymm11,480(%rdi) - -fqmulprecomp 12,13,4 8 -fqmulprecomp 12,13,5 9 -fqmulprecomp 12,13,6 10 -fqmulprecomp 12,13,7 11 - -#store -vmovdqa %ymm4,128(%rdi) -vmovdqa %ymm5,160(%rdi) -vmovdqa %ymm6,192(%rdi) -vmovdqa %ymm7,224(%rdi) - -ret diff --git a/crypto_kem/kyber768-90s/avx2/kem.c b/crypto_kem/kyber768-90s/avx2/kem.c index 3891c2ad..cabbbdbe 100644 --- a/crypto_kem/kyber768-90s/avx2/kem.c +++ b/crypto_kem/kyber768-90s/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER76890S_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - 
randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for 
coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER76890S_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER76890S_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER76890S_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER76890S_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER76890S_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER76890S_AVX2_indcpa_enc(cmp, buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER76890S_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER76890S_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on 
re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER76890S_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768-90s/avx2/kem.h b/crypto_kem/kyber768-90s/avx2/kem.h new file mode 100644 index 00000000..f34e549e --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER76890S_AVX2_KEM_H +#define PQCLEAN_KYBER76890S_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER76890S_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber768-90s/avx2/ntt.S b/crypto_kem/kyber768-90s/avx2/ntt.S new file mode 100644 index 00000000..45b8a6a5 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/ntt.S @@ -0,0 +1,220 @@ +#include "cdecl.inc" +.include "shuffle.inc" +.include "fq.inc" + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 +#mul +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 + +#reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 +vpsubw %ymm12,%ymm\rh0,%ymm12 +vpsubw %ymm13,%ymm\rh1,%ymm13 +vpsubw %ymm14,%ymm\rh2,%ymm14 +vpsubw %ymm15,%ymm\rh3,%ymm15 + +#update +vpsubw %ymm12,%ymm\rl0,%ymm\rh0 +vpaddw %ymm12,%ymm\rl0,%ymm\rl0 +vpsubw 
%ymm13,%ymm\rl1,%ymm\rh1 +vpaddw %ymm13,%ymm\rl1,%ymm\rl1 +vpsubw %ymm14,%ymm\rl2,%ymm\rh2 +vpaddw %ymm14,%ymm\rl2,%ymm\rl2 +vpsubw %ymm15,%ymm\rl3,%ymm\rh3 +vpaddw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +# We break the dependency chains with the cost of slightly more additions. +# But they can be run in parallel to the multiplications on execution port 5 +# (multiplications only go to ports 0 and 1) +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +#mul +vpmullw %ymm\zl0,%ymm\rh0,%ymm12 +vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x +vpmullw %ymm\zl0,%ymm\rh1,%ymm13 +vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 +vpmullw %ymm\zl1,%ymm\rh2,%ymm14 +vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y +vpmullw %ymm\zl1,%ymm\rh3,%ymm15 +vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 + +#reduce +vpmulhw %ymm0,%ymm12,%ymm12 +vpmulhw %ymm0,%ymm13,%ymm13 +vpmulhw %ymm0,%ymm14,%ymm14 +vpmulhw %ymm0,%ymm15,%ymm15 + +vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 +vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 +vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 +vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 +vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 +vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 +vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 +vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 + +#update +vpaddw %ymm12,%ymm\rh0,%ymm\rh0 +vpsubw %ymm12,%ymm\rl0,%ymm\rl0 +vpaddw %ymm13,%ymm\rh1,%ymm\rh1 +vpsubw %ymm13,%ymm\rl1,%ymm\rl1 +vpaddw %ymm14,%ymm\rh2,%ymm\rh2 +vpsubw %ymm14,%ymm\rl2,%ymm\rl2 +vpaddw %ymm15,%ymm\rh3,%ymm\rh3 +vpsubw %ymm15,%ymm\rl3,%ymm\rl3 +.endm + +.text +ntt_level0_avx: +level0: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 256(%rdi),%ymm8 +vmovdqa 288(%rdi),%ymm9 +vmovdqa 320(%rdi),%ymm10 +vmovdqa 352(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm7,96(%rdi) +vmovdqa %ymm8,256(%rdi) +vmovdqa %ymm9,288(%rdi) +vmovdqa %ymm10,320(%rdi) +vmovdqa %ymm11,352(%rdi) + +ret + 
+ntt_levels1t6_avx: +level1: +#zetas +vpbroadcastd (%rsi),%ymm15 +vpbroadcastd 4(%rsi),%ymm1 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +butterfly2 4,5,6,7,8,9,10,11,3 + +level2: +#zetas +vmovdqu 8(%rsi),%ymm15 +vmovdqu 40(%rsi),%ymm1 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +butterfly2 3,8,4,9,5,10,6,11,7 + +level3: +#zetas +vmovdqu 72(%rsi),%ymm15 +vmovdqu 104(%rsi),%ymm1 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +butterfly2 7,5,3,10,8,6,4,11,9 + +level4: +#zetas +vmovdqu 136(%rsi),%ymm15 +vmovdqu 168(%rsi),%ymm1 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +butterfly2 9,8,7,6,5,4,3,11,10 + +level5: +#zetas +vmovdqu 200(%rsi),%ymm15 +vmovdqu 232(%rsi),%ymm1 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +butterfly2 10,5,9,4,8,3,7,11,6 + +level6: +#zetas +vmovdqu 264(%rsi),%ymm14 +vmovdqu 328(%rsi),%ymm15 +vmovdqu 296(%rsi),%ymm1 +vmovdqu 360(%rsi),%ymm2 + +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 + +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber768-90s/avx2/ntt.h b/crypto_kem/kyber768-90s/avx2/ntt.h index 
0b9158df..2184fa6f 100644 --- a/crypto_kem/kyber768-90s/avx2/ntt.h +++ b/crypto_kem/kyber768-90s/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER76890S_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER76890S_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER76890S_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER76890S_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER76890S_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER76890S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER76890S_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +void PQCLEAN_KYBER76890S_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +void PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + + +void PQCLEAN_KYBER76890S_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +void PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + + +void PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +void PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); #endif diff --git 
a/crypto_kem/kyber768-90s/avx2/ntt.s b/crypto_kem/kyber768-90s/avx2/ntt.s deleted file mode 100644 index d0194bc4..00000000 --- a/crypto_kem/kyber768-90s/avx2/ntt.s +++ /dev/null @@ -1,209 +0,0 @@ -.include "shuffle.inc" -.include "fq.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 -.endm - -# We break the dependency chains with the cost of slightly more additions. 
-# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 - -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 - -#update -vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 -vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 -vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 -vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 -.endm - -.global PQCLEAN_KYBER76890S_AVX2_ntt_level0_avx -PQCLEAN_KYBER76890S_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER76890S_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa 
PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11 7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11 9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11 10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 -vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11 6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 - -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 -red16 9 14 -red16 4 15 -red16 8 2 -red16 3 6 -red16 7 12 -red16 11 13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret diff --git a/crypto_kem/kyber768-90s/avx2/params.h b/crypto_kem/kyber768-90s/avx2/params.h index 3a1e0d10..20acde49 100644 --- a/crypto_kem/kyber768-90s/avx2/params.h +++ b/crypto_kem/kyber768-90s/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER76890S_AVX2_PARAMS_H +#define 
PQCLEAN_KYBER76890S_AVX2_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 #define KYBER_POLYCOMPRESSEDBYTES 128 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber768-90s/avx2/poly.c b/crypto_kem/kyber768-90s/avx2/poly.c index 87e8ac1d..6653818b 100644 --- a/crypto_kem/kyber768-90s/avx2/poly.c +++ b/crypto_kem/kyber768-90s/avx2/poly.c @@ -1,112 +1,197 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER76890S_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length 
KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER76890S_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint16_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15); + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[k] = (uint8_t)(t[0] | (t[1] << 4)); - r[k + 1] = (uint8_t)(t[2] | (t[3] << 4)); - r[k + 2] = (uint8_t)(t[4] | (t[5] << 4)); - r[k + 3] = (uint8_t)(t[6] | (t[7] << 4)); - k += 4; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER76890S_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER76890S_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 1] = (int16_t)((((a[0] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 2] = (int16_t)((((a[1] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 3] = (int16_t)((((a[1] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 4] = (int16_t)((((a[2] & 15) * 
KYBER_Q) + 8) >> 4); - r->coeffs[i + 5] = (int16_t)((((a[2] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 6] = (int16_t)((((a[3] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 7] = (int16_t)((((a[3] >> 4) * KYBER_Q) + 8) >> 4); - a += 4; +void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER76890S_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER76890S_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER76890S_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(r->coeffs, a, 
&PQCLEAN_KYBER76890S_AVX2_qdata); +} + +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i 
*)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_loadu_si256((__m256i *)msg); /* unaligned load: msg is a plain byte array with no 32-byte alignment guarantee */ + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(r->coeffs, a); - PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + __m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER76890S_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial *
- const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - PQCLEAN_KYBER76890S_AVX2_cbd(r, buf); +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER76890S_AVX2_cbd(r, buf.arr); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER76890S_AVX2_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -115,73 +200,78 @@ void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_ * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER76890S_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER76890S_AVX2_zetas_exp); - PQCLEAN_KYBER76890S_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER76890S_AVX2_zetas_exp); - PQCLEAN_KYBER76890S_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER76890S_AVX2_zetas_exp + 4); - PQCLEAN_KYBER76890S_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER76890S_AVX2_zetas_exp + 200); + PQCLEAN_KYBER76890S_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in 
normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER76890S_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER76890S_AVX2_zetas_inv_exp); - PQCLEAN_KYBER76890S_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER76890S_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER76890S_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER76890S_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(poly *r) { + PQCLEAN_KYBER76890S_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER76890S_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER76890S_AVX2_nttunpack_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER76890S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs + 64, - a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 380); +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void 
PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER76890S_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER76890S_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER76890S_AVX2_frommont_avx(r->coeffs); - PQCLEAN_KYBER76890S_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER76890S_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER76890S_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_reduce +* +* Description: Applies Barrett reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER76890S_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER76890S_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER76890S_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER76890S_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER76890S_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER76890S_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER76890S_AVX2_poly_add * * Description: Add two polynomials * @@ -190,18 +280,19 @@ void PQCLEAN_KYBER76890S_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER76890S_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -210,127 +301,13 @@ void PQCLEAN_KYBER76890S_AVX2_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER76890S_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i 
*)&b->coeffs[i]); - vec0 = _mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; 
i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = _mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = 
_mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber768-90s/avx2/poly.h b/crypto_kem/kyber768-90s/avx2/poly.h index d29881f6..fb2e94ef 100644 --- a/crypto_kem/kyber768-90s/avx2/poly.h +++ b/crypto_kem/kyber768-90s/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER76890S_AVX2_POLY_H +#define PQCLEAN_KYBER76890S_AVX2_POLY_H #include "params.h" - #include #include @@ -11,32 +10,47 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER76890S_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER76890S_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER76890S_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER76890S_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER76890S_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER76890S_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER76890S_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); void PQCLEAN_KYBER76890S_AVX2_poly_ntt(poly *r); -void PQCLEAN_KYBER76890S_AVX2_poly_invntt(poly *r); 
+ +void PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER76890S_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER76890S_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER76890S_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER76890S_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER76890S_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER76890S_AVX2_poly_reduce(poly *r); + void PQCLEAN_KYBER76890S_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER76890S_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER76890S_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber768-90s/avx2/polyvec.c b/crypto_kem/kyber768-90s/avx2/polyvec.c index 01a5e8f3..f61832b8 100644 --- a/crypto_kem/kyber768-90s/avx2/polyvec.c +++ b/crypto_kem/kyber768-90s/avx2/polyvec.c @@ -1,157 +1,188 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + 
for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER76890S_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - uint8_t *r: pointer to output byte array * - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const 
uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 152); - PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 184); - PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 348); - PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER76890S_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER76890S_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER76890S_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector 
of polynomials +* for details of conditional subtraction of q see comments in +* reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER76890S_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -160,7 +191,8 @@ void PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER76890S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber768-90s/avx2/polyvec.h b/crypto_kem/kyber768-90s/avx2/polyvec.h index ec55b86e..386db8c1 100644 --- a/crypto_kem/kyber768-90s/avx2/polyvec.h +++ b/crypto_kem/kyber768-90s/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER76890S_AVX2_POLYVEC_H +#define PQCLEAN_KYBER76890S_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER76890S_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER76890S_AVX2_polyvec_decompress(polyvec *r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER76890S_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER76890S_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER76890S_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER76890S_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER76890S_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER76890S_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER76890S_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER76890S_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber768-90s/avx2/reduce.h b/crypto_kem/kyber768-90s/avx2/reduce.h index 9905eacb..db8eb984 100644 --- a/crypto_kem/kyber768-90s/avx2/reduce.h +++ b/crypto_kem/kyber768-90s/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER76890S_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER76890S_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER76890S_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER76890S_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +int16_t PQCLEAN_KYBER76890S_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); + +int16_t PQCLEAN_KYBER76890S_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER76890S_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768-90s/avx2/rejsample.c b/crypto_kem/kyber768-90s/avx2/rejsample.c index 6e0b1ad0..c85eff40 100644 --- a/crypto_kem/kyber768-90s/avx2/rejsample.c +++ b/crypto_kem/kyber768-90s/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" #include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const 
uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, 
- { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 
0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 
2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 
0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, -1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 
0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, + { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, 
+ { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, + { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, 
+ { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, -1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, 
+ { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, -1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER76890S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 576 +unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER76890S_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER76890S_AVX2_qdata.as_arr[_16XQ]); + 
const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER76890S_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = 
_mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = _mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - 
_mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned 
int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber768-90s/avx2/rejsample.h b/crypto_kem/kyber768-90s/avx2/rejsample.h index 7ad7a6ec..3fe092a9 100644 --- a/crypto_kem/kyber768-90s/avx2/rejsample.h +++ b/crypto_kem/kyber768-90s/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER76890S_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER76890S_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber768-90s/avx2/shuffle.S b/crypto_kem/kyber768-90s/avx2/shuffle.S new file mode 100644 index 00000000..d4b097c9 --- /dev/null +++ b/crypto_kem/kyber768-90s/avx2/shuffle.S @@ -0,0 +1,255 @@ +#include "cdecl.inc" +.include "fq.inc" +.include "shuffle.inc" + +/* +nttpack_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) 
+vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret +*/ + +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +shuffle8 4,8,3,8 +shuffle8 5,9,4,9 +shuffle8 6,10,5,10 +shuffle8 7,11,6,11 + +shuffle4 3,5,7,5 +shuffle4 8,10,3,10 +shuffle4 4,6,8,6 +shuffle4 9,11,4,11 + +shuffle2 7,8,9,8 +shuffle2 5,6,7,6 +shuffle2 3,4,5,4 +shuffle2 10,11,3,11 + +shuffle1 9,5,10,5 +shuffle1 8,4,9,4 +shuffle1 7,3,8,3 +shuffle1 6,11,7,11 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm5,32(%rdi) +vmovdqa %ymm9,64(%rdi) +vmovdqa %ymm4,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm3,160(%rdi) +vmovdqa %ymm7,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret + +ntttobytes128_avx: +#load +vmovdqa (%rsi),%ymm5 +vmovdqa 32(%rsi),%ymm6 +vmovdqa 64(%rsi),%ymm7 +vmovdqa 96(%rsi),%ymm8 +vmovdqa 128(%rsi),%ymm9 +vmovdqa 160(%rsi),%ymm10 +vmovdqa 192(%rsi),%ymm11 +vmovdqa 224(%rsi),%ymm12 + +#csubq +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 + +#bitpack +vpsllw $12,%ymm6,%ymm4 +vpor %ymm4,%ymm5,%ymm4 + +vpsrlw $4,%ymm6,%ymm5 +vpsllw $8,%ymm7,%ymm6 +vpor %ymm5,%ymm6,%ymm5 + +vpsrlw $8,%ymm7,%ymm6 +vpsllw $4,%ymm8,%ymm7 +vpor %ymm6,%ymm7,%ymm6 + +vpsllw $12,%ymm10,%ymm7 +vpor %ymm7,%ymm9,%ymm7 + +vpsrlw $4,%ymm10,%ymm8 +vpsllw $8,%ymm11,%ymm9 +vpor %ymm8,%ymm9,%ymm8 + +vpsrlw $8,%ymm11,%ymm9 +vpsllw $4,%ymm12,%ymm10 +vpor %ymm9,%ymm10,%ymm9 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 + +shuffle2 3,4,8,4 +shuffle2 6,5,3,5 +shuffle2 7,9,6,9 + +shuffle4 8,3,7,3 +shuffle4 6,4,8,4 +shuffle4 5,9,6,9 + +shuffle8 7,8,5,8 +shuffle8 6,3,7,3 +shuffle8 4,9,6,9 + +#store +vmovdqu %ymm5,(%rdi) +vmovdqu 
%ymm7,32(%rdi) +vmovdqu %ymm6,64(%rdi) +vmovdqu %ymm8,96(%rdi) +vmovdqu %ymm3,128(%rdi) +vmovdqu %ymm9,160(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx): +#consts +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret + +nttfrombytes128_avx: +#load +vmovdqu (%rsi),%ymm4 +vmovdqu 32(%rsi),%ymm5 +vmovdqu 64(%rsi),%ymm6 +vmovdqu 96(%rsi),%ymm7 +vmovdqu 128(%rsi),%ymm8 +vmovdqu 160(%rsi),%ymm9 + +shuffle8 4,7,3,7 +shuffle8 5,8,4,8 +shuffle8 6,9,5,9 + +shuffle4 3,8,6,8 +shuffle4 7,5,3,5 +shuffle4 4,9,7,9 + +shuffle2 6,5,4,5 +shuffle2 8,7,6,7 +shuffle2 3,9,8,9 + +shuffle1 4,7,10,7 +shuffle1 5,8,4,8 +shuffle1 6,9,5,9 + +#bitunpack +vpsrlw $12,%ymm10,%ymm11 +vpsllw $4,%ymm7,%ymm12 +vpor %ymm11,%ymm12,%ymm11 +vpand %ymm0,%ymm10,%ymm10 +vpand %ymm0,%ymm11,%ymm11 + +vpsrlw $8,%ymm7,%ymm12 +vpsllw $8,%ymm4,%ymm13 +vpor %ymm12,%ymm13,%ymm12 +vpand %ymm0,%ymm12,%ymm12 + +vpsrlw $4,%ymm4,%ymm13 +vpand %ymm0,%ymm13,%ymm13 + +vpsrlw $12,%ymm8,%ymm14 +vpsllw $4,%ymm5,%ymm15 +vpor %ymm14,%ymm15,%ymm14 +vpand %ymm0,%ymm8,%ymm8 +vpand %ymm0,%ymm14,%ymm14 + +vpsrlw $8,%ymm5,%ymm15 +vpsllw $8,%ymm9,%ymm1 +vpor %ymm15,%ymm1,%ymm15 +vpand %ymm0,%ymm15,%ymm15 + +vpsrlw $4,%ymm9,%ymm1 +vpand %ymm0,%ymm1,%ymm1 + +#store +vmovdqa %ymm10,(%rdi) +vmovdqa %ymm11,32(%rdi) +vmovdqa %ymm12,64(%rdi) +vmovdqa %ymm13,96(%rdi) +vmovdqa %ymm8,128(%rdi) +vmovdqa %ymm14,160(%rdi) +vmovdqa %ymm15,192(%rdi) +vmovdqa %ymm1,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx) +cdecl(PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git a/crypto_kem/kyber768-90s/avx2/shuffle.inc b/crypto_kem/kyber768-90s/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber768-90s/avx2/shuffle.inc +++ 
b/crypto_kem/kyber768-90s/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber768-90s/avx2/shuffle.s b/crypto_kem/kyber768-90s/avx2/shuffle.s deleted file mode 100644 index e7970412..00000000 --- a/crypto_kem/kyber768-90s/avx2/shuffle.s +++ /dev/null @@ -1,206 +0,0 @@ -.include "fq.inc" -.include "shuffle.inc" - -.global PQCLEAN_KYBER76890S_AVX2_nttunpack_avx -PQCLEAN_KYBER76890S_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xv(%rip),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 -*/ - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx -PQCLEAN_KYBER76890S_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xq(%rip),%ymm0 - -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#csubq -csubq 5 13 -csubq 6 
14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - -vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu %ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx -PQCLEAN_KYBER76890S_AVX2_nttfrombytes_avx: -#consts -vmovdqa PQCLEAN_KYBER76890S_AVX2_16xmask(%rip),%ymm0 - -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw $8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store 
-vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret diff --git a/crypto_kem/kyber768-90s/avx2/symmetric.h b/crypto_kem/kyber768-90s/avx2/symmetric.h index f1247c36..de4cbb63 100644 --- a/crypto_kem/kyber768-90s/avx2/symmetric.h +++ b/crypto_kem/kyber768-90s/avx2/symmetric.h @@ -2,22 +2,26 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "aes256ctr.h" #include "sha2.h" -#define hash_h(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) -#define hash_g(OUT, IN, INBYTES) sha512((OUT), (IN), (INBYTES)) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER76890S_AVX2_aes256ctr_init((STATE), (IN), (Y) + ((uint16_t)(X) << 8)) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks((OUT), (OUTBLOCKS), (STATE)) -#define xof_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf((OUT), (OUTBYTES), (KEY), (NONCE)) -#define kdf(OUT, IN, INBYTES) sha256((OUT), (IN), (INBYTES)) - -#define XOF_BLOCKBYTES 128 - typedef aes256ctr_ctx xof_state; +#define XOF_BLOCKBYTES AES256CTR_BLOCKBYTES + +#define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) +#define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, INBYTES) +#define xof_absorb(STATE, SEED, X, Y) \ + PQCLEAN_KYBER76890S_AVX2_aes256ctr_init(STATE, SEED, (X) | ((uint16_t)(Y) << 8)) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + PQCLEAN_KYBER76890S_AVX2_aes256ctr_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER76890S_AVX2_aes256ctr_prf(OUT, OUTBYTES, KEY, NONCE) +#define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) + #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber768-90s/avx2/verify.c b/crypto_kem/kyber768-90s/avx2/verify.c index c8e0a592..b80bf8a9 100644 --- a/crypto_kem/kyber768-90s/avx2/verify.c +++ 
b/crypto_kem/kyber768-90s/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER76890S_AVX2_verify * * Description: Compare two arrays for equality in constant time. * -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = _mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER76890S_AVX2_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER76890S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER76890S_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber768-90s/avx2/verify.h b/crypto_kem/kyber768-90s/avx2/verify.h index ff0b0193..40e0da15 100644 --- a/crypto_kem/kyber768-90s/avx2/verify.h +++ b/crypto_kem/kyber768-90s/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER76890S_AVX2_VERIFY_H +#define PQCLEAN_KYBER76890S_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER76890S_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER76890S_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber768-90s/clean/LICENSE b/crypto_kem/kyber768-90s/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber768-90s/clean/LICENSE +++ b/crypto_kem/kyber768-90s/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) 
For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber768-90s/clean/Makefile b/crypto_kem/kyber768-90s/clean/Makefile index 5abd4540..ce63bd78 100644 --- a/crypto_kem/kyber768-90s/clean/Makefile +++ b/crypto_kem/kyber768-90s/clean/Makefile @@ -1,8 +1,29 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768-90s_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h aes256ctr.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o aes256ctr.o +HEADERS= \ + api.h \ + cbd.h \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + symmetric-aes.h \ + symmetric.h \ + verify.h +OBJECTS= \ + cbd.o \ + indcpa.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + reduce.o \ + verify.o \ + symmetric-aes.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake index 3c6fb9b2..63b39fdf 100644 --- a/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber768-90s/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber768-90s_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj aes256ctr.obj +OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-aes.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber768-90s/clean/cbd.c b/crypto_kem/kyber768-90s/clean/cbd.c index 798d7176..993e8fa6 100644 --- a/crypto_kem/kyber768-90s/clean/cbd.c +++ b/crypto_kem/kyber768-90s/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" 
#include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x **************************************************/ -static uint32_t load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER76890S_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber768-90s/clean/cbd.h b/crypto_kem/kyber768-90s/clean/cbd.h index bc1457c4..bb651eb0 100644 --- a/crypto_kem/kyber768-90s/clean/cbd.h +++ b/crypto_kem/kyber768-90s/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_CBD_H +#define 
PQCLEAN_KYBER76890S_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER76890S_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber768-90s/clean/indcpa.c b/crypto_kem/kyber768-90s/clean/indcpa.c index 438cbc81..c87af874 100644 --- a/crypto_kem/kyber768-90s/clean/indcpa.c +++ b/crypto_kem/kyber768-90s/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, 
uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: 
pointer to the output serialized ciphertext +* polyvec *b: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER76890S_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER76890S_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed 
to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) **************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER76890S_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER76890S_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER76890S_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER76890S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption 
scheme underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER76890S_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + 
PQCLEAN_KYBER76890S_CLEAN_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER76890S_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER76890S_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; i++) { 
+ for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t *c, PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER76890S_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER76890S_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER76890S_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER76890S_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER76890S_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER76890S_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER76890S_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber768-90s/clean/indcpa.h b/crypto_kem/kyber768-90s/clean/indcpa.h index 54120670..85782d60 100644 --- a/crypto_kem/kyber768-90s/clean/indcpa.h +++ b/crypto_kem/kyber768-90s/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_INDCPA_H +#define PQCLEAN_KYBER76890S_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER76890S_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void 
PQCLEAN_KYBER76890S_CLEAN_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber768-90s/clean/kem.c b/crypto_kem/kyber768-90s/clean/kem.c index f2f645ca..ca1a54f1 100644 --- a/crypto_kem/kyber768-90s/clean/kem.c +++ b/crypto_kem/kyber768-90s/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int 
PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER76890S_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, 
KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + hash_h(buf, buf, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER76890S_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER76890S_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER76890S_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER76890S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER76890S_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k 
and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768-90s/clean/kem.h b/crypto_kem/kyber768-90s/clean/kem.h new file mode 100644 index 00000000..6548f8ec --- /dev/null +++ b/crypto_kem/kyber768-90s/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER76890S_CLEAN_KEM_H +#define PQCLEAN_KYBER76890S_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER76890S_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber768-90s/clean/ntt.c b/crypto_kem/kyber768-90s/clean/ntt.c index 2ca52459..1d65a104 100644 --- a/crypto_kem/kyber768-90s/clean/ntt.c +++ b/crypto_kem/kyber768-90s/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER76890S_CLEAN_zetas and PQCLEAN_KYBER76890S_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = 
tmp[tree[i]]; + PQCLEAN_KYBER76890S_CLEAN_zetas[i] = tmp[tree[i]]; k = 0; for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER76890S_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER76890S_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; 
const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER76890S_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements 
of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER76890S_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER76890S_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER76890S_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER76890S_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER76890S_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial 
**************************************************/ void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber768-90s/clean/ntt.h b/crypto_kem/kyber768-90s/clean/ntt.h index 90b83eb7..cd5c868e 100644 --- a/crypto_kem/kyber768-90s/clean/ntt.h +++ b/crypto_kem/kyber768-90s/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_NTT_H +#define PQCLEAN_KYBER76890S_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER76890S_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER76890S_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER76890S_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER76890S_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER76890S_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber768-90s/clean/params.h b/crypto_kem/kyber768-90s/clean/params.h index 3a1e0d10..f33f77f4 100644 --- a/crypto_kem/kyber768-90s/clean/params.h +++ b/crypto_kem/kyber768-90s/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER76890S_CLEAN_PARAMS_H +#define PQCLEAN_KYBER76890S_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 #define KYBER_POLYCOMPRESSEDBYTES 128 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define 
KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber768-90s/clean/poly.c b/crypto_kem/kyber768-90s/clean/poly.c index b5ad12b2..30bf5c70 100644 --- a/crypto_kem/kyber768-90s/clean/poly.c +++ b/crypto_kem/kyber768-90s/clean/poly.c @@ -1,119 +1,164 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 0; j < 8; 
j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[k] = (uint8_t)(t[0] | (t[1] << 4)); - r[k + 1] = (uint8_t)(t[2] | (t[3] << 4)); - r[k + 2] = (uint8_t)(t[4] | (t[5] << 4)); - r[k + 3] = (uint8_t)(t[6] | (t[7] << 4)); - k += 4; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER76890S_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 1] = (int16_t)((((a[0] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 2] = (int16_t)((((a[1] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 3] = (int16_t)((((a[1] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 4] = (int16_t)((((a[2] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 5] = (int16_t)((((a[2] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 6] = (int16_t)((((a[3] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 7] = (int16_t)((((a[3] >> 4) * KYBER_Q) + 8) >> 4); - a += 4; +void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + 
unsigned int i = 0; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER76890S_CLEAN_poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i 
+ 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; + } +} + +/************************************************* +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_getnoise * * Description: 
Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER76890S_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -127,20 +172,20 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER76890S_CLEAN_invntt(r->coeffs); } /************************************************* -* Name: poly_basemul +* Name: 
PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -148,68 +193,64 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly *b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER76890S_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER76890S_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER76890S_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER76890S_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER76890S_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER76890S_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce( - (int32_t)r->coeffs[i] * f); + for 
(i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. 
For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER76890S_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_add * * Description: Add two polynomials * @@ -218,13 +259,14 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER76890S_CLEAN_poly_sub * * Description: Subtract two polynomials * @@ -233,48 +275,8 @@ void PQCLEAN_KYBER76890S_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - 
uint16_t mask; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER76890S_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber768-90s/clean/poly.h b/crypto_kem/kyber768-90s/clean/poly.h index 53c39fa2..54398da3 100644 --- a/crypto_kem/kyber768-90s/clean/poly.h +++ b/crypto_kem/kyber768-90s/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_POLY_H +#define PQCLEAN_KYBER76890S_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER76890S_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER76890S_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER76890S_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER76890S_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER76890S_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER76890S_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER76890S_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER76890S_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER76890S_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER76890S_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER76890S_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER76890S_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + void 
PQCLEAN_KYBER76890S_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber768-90s/clean/polyvec.c b/crypto_kem/kyber768-90s/clean/polyvec.c index 459feb6d..b261f051 100644 --- a/crypto_kem/kyber768-90s/clean/polyvec.c +++ b/crypto_kem/kyber768-90s/clean/polyvec.c @@ -1,128 +1,153 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] 
>> 8) | (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER76890S_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; 
+ } } - a += 320; } } /************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - uint8_t *r: pointer to output byte array +* - const polyvec *a: pointer to input vector of polynomials +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: 
PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER76890S_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER76890S_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER76890S_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER76890S_CLEAN_poly_add(r, r, &t); } @@ -130,37 +155,40 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see 
comments in reduce.c +* for details of conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER76890S_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -169,7 +197,8 @@ void PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER76890S_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber768-90s/clean/polyvec.h b/crypto_kem/kyber768-90s/clean/polyvec.h index c2bb284b..2ade476f 100644 --- a/crypto_kem/kyber768-90s/clean/polyvec.h +++ b/crypto_kem/kyber768-90s/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER76890S_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void 
PQCLEAN_KYBER76890S_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER76890S_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER76890S_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER76890S_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER76890S_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER76890S_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER76890S_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER76890S_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER76890S_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber768-90s/clean/reduce.c b/crypto_kem/kyber768-90s/clean/reduce.c index 9aa7f5bf..2ad02e13 100644 --- a/crypto_kem/kyber768-90s/clean/reduce.c +++ b/crypto_kem/kyber768-90s/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: montgomery_reduce +* Name: PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER76890S_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER76890S_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t x: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber768-90s/clean/reduce.h b/crypto_kem/kyber768-90s/clean/reduce.h index 20a73c41..f08b032c 100644 --- a/crypto_kem/kyber768-90s/clean/reduce.h +++ b/crypto_kem/kyber768-90s/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_REDUCE_H +#define PQCLEAN_KYBER76890S_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER76890S_CLEAN_montgomery_reduce(int32_t a); + int16_t 
PQCLEAN_KYBER76890S_CLEAN_barrett_reduce(int16_t a); + int16_t PQCLEAN_KYBER76890S_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber768-90s/clean/aes256ctr.c b/crypto_kem/kyber768-90s/clean/symmetric-aes.c similarity index 98% rename from crypto_kem/kyber768-90s/clean/aes256ctr.c rename to crypto_kem/kyber768-90s/clean/symmetric-aes.c index 3deae94c..e1ef178c 100644 --- a/crypto_kem/kyber768-90s/clean/aes256ctr.c +++ b/crypto_kem/kyber768-90s/clean/symmetric-aes.c @@ -1,4 +1,4 @@ -#include "aes256ctr.h" +#include "symmetric-aes.h" #include "aes.h" #include #include @@ -14,7 +14,7 @@ static inline void br_enc32be(unsigned char *dst, uint32_t x) { static void aes256_ctr_xof(unsigned char *out, size_t outlen, const unsigned char *iv, uint32_t ctr, const aes256ctx *ctx) { uint8_t ivw[16]; uint8_t buf[AES_BLOCKBYTES]; - size_t i; + size_t i = 0; memcpy(ivw, iv, AESCTR_NONCEBYTES); br_enc32be(ivw + AESCTR_NONCEBYTES, ctr); @@ -94,7 +94,6 @@ void PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks(uint8_t *out, size_t nblo s->ctr += (uint32_t) (4 * nblocks); } -/** Free the AES ctx **/ void PQCLEAN_KYBER76890S_CLEAN_aes256xof_ctx_release(aes256xof_ctx *s) { aes256_ctx_release(&s->sk_exp); } diff --git a/crypto_kem/kyber768-90s/clean/aes256ctr.h b/crypto_kem/kyber768-90s/clean/symmetric-aes.h similarity index 100% rename from crypto_kem/kyber768-90s/clean/aes256ctr.h rename to crypto_kem/kyber768-90s/clean/symmetric-aes.h diff --git a/crypto_kem/kyber768-90s/clean/symmetric.h b/crypto_kem/kyber768-90s/clean/symmetric.h index 681ea3d7..9e46b337 100644 --- a/crypto_kem/kyber768-90s/clean/symmetric.h +++ b/crypto_kem/kyber768-90s/clean/symmetric.h @@ -2,22 +2,24 @@ #define SYMMETRIC_H #include "params.h" +#include +#include -#include "aes256ctr.h" #include "sha2.h" +#include "symmetric-aes.h" + +typedef aes256xof_ctx xof_state; + +#define XOF_BLOCKBYTES 64 #define hash_h(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha512(OUT, IN, 
INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb(STATE, IN, X, Y) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER76890S_CLEAN_aes256xof_absorb(STATE, SEED, X, Y) #define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER76890S_CLEAN_aes256xof_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) PQCLEAN_KYBER76890S_CLEAN_aes256xof_ctx_release(STATE) #define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER76890S_CLEAN_aes256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) sha256(OUT, IN, INBYTES) -#define XOF_BLOCKBYTES 64 - -typedef aes256xof_ctx xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber768-90s/clean/verify.c b/crypto_kem/kyber768-90s/clean/verify.c index 3a41a400..ec472fe3 100644 --- a/crypto_kem/kyber768-90s/clean/verify.c +++ b/crypto_kem/kyber768-90s/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER76890S_CLEAN_verify * * Description: Compare two arrays for equality in constant time. * * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER76890S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER76890S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER76890S_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. 
Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER76890S_CLEAN_verify(const uint8_t *a, const uint8_t *b, siz * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER76890S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber768-90s/clean/verify.h b/crypto_kem/kyber768-90s/clean/verify.h index 7079be39..216f2792 100644 --- a/crypto_kem/kyber768-90s/clean/verify.h +++ b/crypto_kem/kyber768-90s/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER76890S_CLEAN_VERIFY_H +#define PQCLEAN_KYBER76890S_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER76890S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER76890S_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER76890S_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber768/META.yml b/crypto_kem/kyber768/META.yml index 73056cd7..34af9744 100644 --- a/crypto_kem/kyber768/META.yml +++ b/crypto_kem/kyber768/META.yml @@ -20,15 +20,16 @@ auxiliary-submitters: - Gregor Seiler - Damien Stehlé implementations: - - name: clean - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - - name: avx2 - version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 - supported_platforms: - - architecture: x86_64 - operating_systems: - - Linux - required_flags: - - avx2 - - bmi2 - - popcnt + - name: clean + version: 
https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + - name: avx2 + version: https://github.com/pq-crystals/kyber/commit/46e283ab575ec92dfe82fb12229ae2d9d6246682 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 + - bmi2 + - popcnt diff --git a/crypto_kem/kyber768/avx2/LICENSE b/crypto_kem/kyber768/avx2/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber768/avx2/LICENSE +++ b/crypto_kem/kyber768/avx2/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from sources and by authors listed in diff --git a/crypto_kem/kyber768/avx2/Makefile b/crypto_kem/kyber768/avx2/Makefile index 6addb34c..2008206d 100644 --- a/crypto_kem/kyber768/avx2/Makefile +++ b/crypto_kem/kyber768/avx2/Makefile @@ -1,26 +1,58 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768_avx2.a -HEADERS=api.h params.h poly.h polyvec.h reduce.h fq.inc cbd.h consts.h ntt.h shuffle.inc verify.h indcpa.h rejsample.h symmetric.h fips202x4.h -OBJECTS=kem.o poly.o polyvec.o fq.o shuffle.o cbd.o ntt.o invntt.o basemul.o consts.o \ - verify.o indcpa.o rejsample.o fips202x4.o symmetric-fips202.o +HEADERS= \ + align.h \ + api.h \ + cbd.h \ + cdecl.inc \ + consts.h \ + fips202x4.h \ + fq.inc \ + indcpa.h \ + kem.h \ + ntt.h \ + params.h \ + poly.h \ + polyvec.h \ + reduce.h \ + rejsample.h \ + shuffle.inc \ + symmetric.h \ + verify.h +OBJECTS= \ + basemul.o \ + cbd.o \ + consts.o \ + fips202x4.o \ + fq.o \ + indcpa.o \ + invntt.o \ + kem.o \ + ntt.o \ + poly.o \ + polyvec.o \ + rejsample.o \ + shuffle.o \ + symmetric-shake.o \ + verify.o KECCAK4XDIR=../../../common/keccak4x 
KECCAK4XOBJ=KeccakP-1600-times4-SIMD256.o KECCAK4X=$(KECCAK4XDIR)/$(KECCAK4XOBJ) -CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-mavx2 -mbmi2 -mpopcnt -O3 -Wall -Wextra -Wpedantic -Werror \ + -Wmissing-prototypes -Wredundant-decls \ + -Wpointer-arith -Wshadow \ + -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) %.o: %.c $(HEADERS) $(CC) $(CFLAGS) -c -o $@ $< -%.o: %.s $(HEADERS) - $(AS) -o $@ $< - %.o: %.S $(HEADERS) - $(AS) -c -o $@ $< + $(CC) $(CFLAGS) -c -o $@ $< $(LIB): $(OBJECTS) $(KECCAK4X) $(AR) -r $@ $(OBJECTS) $(KECCAK4X) diff --git a/crypto_kem/kyber768/avx2/align.h b/crypto_kem/kyber768/avx2/align.h new file mode 100644 index 00000000..517fb939 --- /dev/null +++ b/crypto_kem/kyber768/avx2/align.h @@ -0,0 +1,22 @@ +#ifndef PQCLEAN_KYBER768_AVX2_ALIGN_H +#define PQCLEAN_KYBER768_AVX2_ALIGN_H +#include + +#define ALIGN16_TYPE(t) \ + union { \ + __m128i vec; \ + t orig; \ + } + +#define ALIGN32_ARRAY(t, s) \ + union { \ + __m256i vec; \ + t arr[(s)]; \ + } + +#define ALIGN32_ARRAY_2D(t, n, m) \ + union { \ + __m256i vec; \ + t arr[(n)][(m)]; \ + } +#endif diff --git a/crypto_kem/kyber768/avx2/basemul.S b/crypto_kem/kyber768/avx2/basemul.S index 1e857a48..487f3fd6 100644 --- a/crypto_kem/kyber768/avx2/basemul.S +++ b/crypto_kem/kyber768/avx2/basemul.S @@ -1,4 +1,5 @@ #include "params.h" +#include "cdecl.inc" .macro schoolbook off,sign #load @@ -48,7 +49,7 @@ vpaddd %ymm12,%ymm13,%ymm12 # y0 vpaddd %ymm7,%ymm8,%ymm7 # y1 .endm -.macro red a0,a1,b0,b1 x,y,z +.macro red a0,a1,b0,b1,x,y,z #pack vpxor %ymm\x,%ymm\x,%ymm\x vpblendw $0xAA,%ymm\x,%ymm\a0,%ymm\y @@ -73,13 +74,8 @@ vpsubw %ymm\z,%ymm\a0,%ymm\a0 vpsubw %ymm\y,%ymm\b0,%ymm\b0 .endm -.global PQCLEAN_KYBER768_AVX2_basemul_acc_avx -PQCLEAN_KYBER768_AVX2_basemul_acc_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER768_AVX2_16xqinv(%rip),%ymm1 -vmovdqu 
(%rcx),%ymm2 - +.text +basemul64_acc_avx: poly0.0: schoolbook 0,0 @@ -109,7 +105,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,(%rdi) @@ -144,7 +140,7 @@ vpaddd %ymm7,%ymm6,%ymm6 #reduce -red 3,4,5,6 7,8,9 +red 3,4,5,6,7,8,9 #store vmovdqa %ymm3,64(%rdi) @@ -152,17 +148,40 @@ vmovdqa %ymm5,96(%rdi) ret -.global PQCLEAN_KYBER768_AVX2_basemul_avx -PQCLEAN_KYBER768_AVX2_basemul_avx: +.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx) +cdecl(PQCLEAN_KYBER768_AVX2_basemul_acc_avx): #consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER768_AVX2_16xqinv(%rip),%ymm1 -vmovdqu (%rcx),%ymm2 +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_acc_avx + +ret +basemul64_avx: schoolbook 0,0 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,(%rdi) @@ -171,10 +190,39 @@ vmovdqa %ymm12,32(%rdi) schoolbook 64,1 #reduce -red 14,9,12,7 8,10,11 +red 14,9,12,7,8,10,11 #store vmovdqa %ymm14,64(%rdi) vmovdqa %ymm12,96(%rdi) ret + +.global cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx) +cdecl(PQCLEAN_KYBER768_AVX2_basemul_avx): +#consts +vmovdqa _16XQ*2(%rcx),%ymm0 +vmovdqa _16XQINV*2(%rcx),%ymm1 + +vmovdqu (_ZETAS_EXP+152)*2(%rcx),%ymm2 +call basemul64_avx + +vmovdqu (_ZETAS_EXP+184)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+348)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +vmovdqu (_ZETAS_EXP+380)*2(%rcx),%ymm2 +add $128,%rdi +add $128,%rsi +add $128,%rdx +call basemul64_avx + +ret diff --git 
a/crypto_kem/kyber768/avx2/cbd.c b/crypto_kem/kyber768/avx2/cbd.c index ac2abaa2..7d4a3b30 100644 --- a/crypto_kem/kyber768/avx2/cbd.c +++ b/crypto_kem/kyber768/avx2/cbd.c @@ -1,27 +1,27 @@ -#include "cbd.h" #include "params.h" - +#include "cbd.h" #include #include /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER768_AVX2_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA * * Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *buf: pointer to input byte array +* - const unsigned char *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER768_AVX2_cbd(poly *r, const uint8_t *buf) { +void PQCLEAN_KYBER768_AVX2_cbd(poly *restrict r, const uint8_t *restrict buf) { + unsigned int i = 0; __m256i vec0, vec1, vec2, vec3, tmp; const __m256i mask55 = _mm256_set1_epi32(0x55555555); const __m256i mask33 = _mm256_set1_epi32(0x33333333); const __m256i mask03 = _mm256_set1_epi32(0x03030303); - for (size_t i = 0; i < KYBER_N / 64; i++) { - vec0 = _mm256_loadu_si256((__m256i *)&buf[32 * i]); + for (i = 0; i < KYBER_N / 64; i++) { + vec0 = _mm256_load_si256((__m256i *)&buf[32 * i]); vec1 = _mm256_srli_epi32(vec0, 1); vec0 = _mm256_and_si256(mask55, vec0); diff --git a/crypto_kem/kyber768/avx2/cbd.h b/crypto_kem/kyber768/avx2/cbd.h index 9ea7fdf5..4bbb86e8 100644 --- a/crypto_kem/kyber768/avx2/cbd.h +++ b/crypto_kem/kyber768/avx2/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER768_AVX2_CBD_H +#define PQCLEAN_KYBER768_AVX2_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER768_AVX2_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER768_AVX2_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git a/crypto_kem/kyber768/avx2/cdecl.inc b/crypto_kem/kyber768/avx2/cdecl.inc new 
file mode 100644 index 00000000..8ded53b1 --- /dev/null +++ b/crypto_kem/kyber768/avx2/cdecl.inc @@ -0,0 +1,30 @@ +#ifndef PQCLEAN_DILITHIUM2_AVX2_CDECL +#define PQCLEAN_DILITHIUM2_AVX2_CDECL + +#define _16XQ 0 +#define _16XQINV 16 +#define _16XV 32 +#define _16XFLO 48 +#define _16XFHI 64 +#define _16XMONTSQLO 80 +#define _16XMONTSQHI 96 +#define _16XMASK 112 +#define _ZETAS_EXP 128 +#define _ZETAS_INV_EXP 528 + + +/* The C ABI on MacOS exports all symbols with a leading + * underscore. This means that any symbols we refer to from + * C files (functions) can't be found, and all symbols we + * refer to from ASM also can't be found (nttconsts.c). + * + * This define helps us get around this + */ + +#if defined(__WIN32__) || defined(__APPLE__) +#define cdecl(s) _##s +#else +#define cdecl(s) s +#endif + +#endif diff --git a/crypto_kem/kyber768/avx2/consts.c b/crypto_kem/kyber768/avx2/consts.c index 5281c841..f0652610 100644 --- a/crypto_kem/kyber768/avx2/consts.c +++ b/crypto_kem/kyber768/avx2/consts.c @@ -1,34 +1,155 @@ -#include "consts.h" #include "params.h" - -const uint16_t PQCLEAN_KYBER768_AVX2_zetas_exp[396] = {31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, 3158, 3158, 3158, 3158, 622, 622, 622, 622, 1577, 1577, 1577, 1577, 182, 182, 182, 182, 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, 573, 573, 2004, 2004, 264, 264, 383, 383, 2500, 2500, 1458, 1458, 1727, 1727, 3199, 3199, 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 
65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, 2226, 555, 2078, 1550, 422, 177, 3038, 1574, 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, 430, 843, 871, 105, 587, 3094, 2869, 1653, 778, 3182, 1483, 1119, 644, 349, 329, 3254, 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 48842, 287, 287, 287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690, 10690, 10690, 10690, 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, 31164, 31164, 31164, 31164, 962, 962, 962, 962, 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, 732, 732, 608, 608, 1787, 1787, 411, 411, 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, 3193, 1994, 220, 1670, 1799, 794, 2475, 478, 3021, 991, 1869, 1628}; -const uint16_t PQCLEAN_KYBER768_AVX2_zetas_inv_exp[396] = {42405, 57780, 20258, 23860, 17443, 42326, 20199, 21498, 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, 1807, 
2371, 2333, 108, 870, 1510, 1278, 1185, 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, 2210, 1846, 147, 2551, 1676, 460, 235, 2742, 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, 45043, 32227, 11478, 335, 156, 2911, 872, 1590, 602, 777, 2170, 246, 1755, 291, 3152, 2907, 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, 48173, 48173, 5828, 5828, 130, 130, 1602, 1602, 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, 171, 171, 171, 171, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 12403, 52012, 52012, 
52012, 52012, 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, 60300, 60300, 1932, 1932}; +#include "consts.h" +#include #define Q KYBER_Q -#define MONT ((1U << 16) % KYBER_Q) +#define MONT ((1U << 16) % Q) #define QINV 62209 // q^-1 mod 2^16 -#define V ((1U << 26)/KYBER_Q + 1) -#define FHI (MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q) -#define FLO (FHI * QINV % 65536) -#define MONTSQHI (MONT * MONT % KYBER_Q) -#define MONTSQLO (MONTSQHI * QINV % 65536) +#define V (((1U << 26) + Q/2)/Q) +#define FHI (MONT*(MONT*(Q-1)*((Q-1)/128) % Q) % Q) +#define FLO (FHI*QINV % 65536) +#define MONTSQHI (MONT*MONT % Q) +#define MONTSQLO (MONTSQHI*QINV % 65536) #define MASK 4095 -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xq = {.as_arr = {Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xqinv = {.as_arr = {QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xv = {.as_arr = {V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xflo = {.as_arr = {FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xfhi = {.as_arr = {FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xmontsqlo = {.as_arr = {MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO}}; -const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xmontsqhi = {.as_arr = {MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI}}; -const aligned_uint16_t 
PQCLEAN_KYBER768_AVX2_16xmask = {.as_arr = {MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK}}; - -#undef Q -#undef QINV -#undef MONT -#undef V -#undef FLO -#undef FHI -#undef MONTSQLO -#undef MONTSQHI -#undef MASK + +const qdata_t PQCLEAN_KYBER768_AVX2_qdata = {.as_arr = { +#define _16XQ 0 + Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, Q, + +#define _16XQINV 16 + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + QINV, QINV, QINV, QINV, QINV, QINV, QINV, QINV, + +#define _16XV 32 + V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, + +#define _16XFLO 48 + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + FLO, FLO, FLO, FLO, FLO, FLO, FLO, FLO, + +#define _16XFHI 64 + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + FHI, FHI, FHI, FHI, FHI, FHI, FHI, FHI, + +#define _16XMONTSQLO 80 + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + MONTSQLO, MONTSQLO, MONTSQLO, MONTSQLO, + +#define _16XMONTSQHI 96 + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + MONTSQHI, MONTSQHI, MONTSQHI, MONTSQHI, + +#define _16XMASK 112 + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + MASK, MASK, MASK, MASK, MASK, MASK, MASK, MASK, + +#define _ZETAS_EXP 128 + 31499, 31499, 2571, 2571, 14746, 14746, 2970, 2970, + 13525, 13525, 13525, 13525, 13525, 13525, 13525, 13525, + 53134, 53134, 53134, 53134, 53134, 53134, 53134, 53134, + 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493, + 1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, + 44630, 44630, 44630, 44630, 27758, 27758, 27758, 27758, + 61737, 61737, 61737, 61737, 49846, 49846, 49846, 49846, + 3158, 3158, 3158, 3158, 622, 622, 622, 622, + 1577, 1577, 1577, 1577, 182, 182, 182, 182, + 59709, 59709, 17364, 17364, 39176, 39176, 36479, 36479, + 5572, 5572, 64434, 64434, 21439, 21439, 39295, 39295, + 573, 573, 2004, 2004, 264, 264, 383, 383, + 2500, 2500, 1458, 1458, 
1727, 1727, 3199, 3199, + 59847, 59020, 1497, 30967, 41972, 20179, 20711, 25081, + 52740, 26617, 16065, 53095, 9135, 64887, 39550, 27837, + 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, + 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, + 65202, 54059, 33310, 20494, 37798, 945, 50654, 6182, + 32011, 10631, 29176, 36775, 47051, 17561, 51106, 60261, + 2226, 555, 2078, 1550, 422, 177, 3038, 1574, + 3083, 1159, 2552, 2727, 1739, 2457, 418, 3173, + 11182, 13387, 51303, 43881, 13131, 60950, 23093, 5493, + 33034, 30318, 46795, 12639, 20100, 18525, 19529, 52918, + 430, 843, 871, 105, 587, 3094, 2869, 1653, + 778, 3182, 1483, 1119, 644, 349, 329, 3254, + 788, 788, 1812, 1812, 28191, 28191, 28191, 28191, + 28191, 28191, 28191, 28191, 48842, 48842, 48842, 48842, + 48842, 48842, 48842, 48842, 287, 287, 287, 287, + 287, 287, 287, 287, 202, 202, 202, 202, + 202, 202, 202, 202, 10690, 10690, 10690, 10690, + 1359, 1359, 1359, 1359, 54335, 54335, 54335, 54335, + 31164, 31164, 31164, 31164, 962, 962, 962, 962, + 2127, 2127, 2127, 2127, 1855, 1855, 1855, 1855, + 1468, 1468, 1468, 1468, 37464, 37464, 24313, 24313, + 55004, 55004, 8800, 8800, 18427, 18427, 8859, 8859, + 26676, 26676, 49374, 49374, 2648, 2648, 1017, 1017, + 732, 732, 608, 608, 1787, 1787, 411, 411, + 3124, 3124, 1758, 1758, 19884, 37287, 49650, 56638, + 37227, 9076, 35338, 18250, 13427, 14017, 36381, 52780, + 16832, 4312, 41381, 47622, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, + 448, 2264, 677, 2054, 34353, 25435, 58154, 24392, + 44610, 10946, 24215, 16990, 10336, 57603, 43035, 10907, + 31637, 28644, 23998, 48114, 817, 603, 1322, 1864, + 2114, 1218, 2455, 2142, 2144, 2051, 1819, 2459, + 3221, 996, 958, 1522, 20297, 2146, 15356, 33152, + 59257, 50634, 54492, 14470, 44039, 45338, 23211, 48094, + 41677, 45279, 7757, 23132, 1097, 610, 2044, 384, + 3193, 1994, 220, 1670, 1799, 794, 2475, 478, + 3021, 991, 1869, 1628, 0, 0, 0, 0, + +#define _ZETAS_INV_EXP 528 + 42405, 57780, 20258, 23860, 17443, 
42326, 20199, 21498, + 51067, 11045, 14903, 6280, 32385, 50181, 63391, 45240, + 1701, 1460, 2338, 308, 2851, 854, 2535, 1530, + 1659, 3109, 1335, 136, 2945, 1285, 2719, 2232, + 17423, 41539, 36893, 33900, 54630, 22502, 7934, 55201, + 48547, 41322, 54591, 20927, 41145, 7383, 40102, 31184, + 1807, 2371, 2333, 108, 870, 1510, 1278, 1185, + 1187, 874, 2111, 1215, 1465, 2007, 2726, 2512, + 17915, 24156, 61225, 48705, 12757, 29156, 51520, 52110, + 47287, 30199, 56461, 28310, 8899, 15887, 28250, 45653, + 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, + 951, 247, 1421, 3222, 2499, 271, 90, 853, + 16163, 16163, 38861, 38861, 56678, 56678, 47110, 47110, + 56737, 56737, 10533, 10533, 41224, 41224, 28073, 28073, + 1571, 1571, 205, 205, 2918, 2918, 1542, 1542, + 2721, 2721, 2597, 2597, 2312, 2312, 681, 681, + 34373, 34373, 34373, 34373, 11202, 11202, 11202, 11202, + 64178, 64178, 64178, 64178, 54847, 54847, 54847, 54847, + 1861, 1861, 1861, 1861, 1474, 1474, 1474, 1474, + 1202, 1202, 1202, 1202, 2367, 2367, 2367, 2367, + 16695, 16695, 16695, 16695, 16695, 16695, 16695, 16695, + 37346, 37346, 37346, 37346, 37346, 37346, 37346, 37346, + 3127, 3127, 3127, 3127, 3127, 3127, 3127, 3127, + 3042, 3042, 3042, 3042, 3042, 3042, 3042, 3042, + 64749, 64749, 1517, 1517, 12619, 46008, 47012, 45437, + 52898, 18742, 35219, 32503, 60044, 42444, 4587, 52406, + 21656, 14234, 52150, 54355, 75, 3000, 2980, 2685, + 2210, 1846, 147, 2551, 1676, 460, 235, 2742, + 3224, 2458, 2486, 2899, 5276, 14431, 47976, 18486, + 28762, 36361, 54906, 33526, 59355, 14883, 64592, 27739, + 45043, 32227, 11478, 335, 156, 2911, 872, 1590, + 602, 777, 2170, 246, 1755, 291, 3152, 2907, + 1779, 1251, 2774, 1103, 37700, 25987, 650, 56402, + 12442, 49472, 38920, 12797, 40456, 44826, 45358, 23565, + 34570, 64040, 6517, 5690, 1860, 3203, 1162, 1618, + 666, 320, 8, 2813, 1544, 282, 1838, 1293, + 2314, 552, 2677, 2106, 26242, 26242, 44098, 44098, + 1103, 1103, 59965, 59965, 29058, 29058, 26361, 26361, + 48173, 48173, 5828, 5828, 
130, 130, 1602, 1602, + 1871, 1871, 829, 829, 2946, 2946, 3065, 3065, + 1325, 1325, 2756, 2756, 15691, 15691, 15691, 15691, + 3800, 3800, 3800, 3800, 37779, 37779, 37779, 37779, + 20907, 20907, 20907, 20907, 3147, 3147, 3147, 3147, + 1752, 1752, 1752, 1752, 2707, 2707, 2707, 2707, + 171, 171, 171, 171, 12403, 12403, 12403, 12403, + 12403, 12403, 12403, 12403, 52012, 52012, 52012, 52012, + 52012, 52012, 52012, 52012, 1907, 1907, 1907, 1907, + 1907, 1907, 1907, 1907, 1836, 1836, 1836, 1836, + 1836, 1836, 1836, 1836, 50791, 50791, 359, 359, + 60300, 60300, 1932, 1932, 0, 0, 0, 0 + } +}; diff --git a/crypto_kem/kyber768/avx2/consts.h b/crypto_kem/kyber768/avx2/consts.h index b18ae260..f7bcfb7c 100644 --- a/crypto_kem/kyber768/avx2/consts.h +++ b/crypto_kem/kyber768/avx2/consts.h @@ -1,24 +1,20 @@ -#ifndef CONSTS_H -#define CONSTS_H +#ifndef PQCLEAN_KYBER768_AVX2_CONSTS_H +#define PQCLEAN_KYBER768_AVX2_CONSTS_H +#include "cdecl.inc" + +#include "params.h" #include #include -typedef union { - uint16_t as_arr[16]; - __m256i as_vec; -} aligned_uint16_t; +#define ALIGNED_UINT16_T(N) \ + union { \ + __m256i as_vec; \ + uint16_t as_arr[(N)]; \ + } -extern const uint16_t PQCLEAN_KYBER768_AVX2_zetas_exp[396]; -extern const uint16_t PQCLEAN_KYBER768_AVX2_zetas_inv_exp[396]; +typedef ALIGNED_UINT16_T(928) qdata_t; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xq; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xqinv; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xv; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xflo; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xfhi; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xmontsqlo; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xmontsqhi; -extern const aligned_uint16_t PQCLEAN_KYBER768_AVX2_16xmask; +extern const qdata_t PQCLEAN_KYBER768_AVX2_qdata; #endif diff --git a/crypto_kem/kyber768/avx2/fips202x4.c b/crypto_kem/kyber768/avx2/fips202x4.c index cade5edf..80f89aa7 100644 --- 
a/crypto_kem/kyber768/avx2/fips202x4.c +++ b/crypto_kem/kyber768/avx2/fips202x4.c @@ -1,148 +1,111 @@ #include "fips202.h" #include "fips202x4.h" -#include "params.h" - #include +#include #include +#include +/* Use implementation from the Keccak Code Package */ +#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds +extern void KeccakF1600_StatePermute4x(__m256i *s); -#define NROUNDS 24 -#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64-(offset)))) - -static uint64_t load64(const uint8_t *x) { - unsigned long long r = 0, i; - - for (i = 0; i < 8; ++i) { - r |= (unsigned long long)x[i] << 8 * i; - } - return r; -} - -static void store64(uint8_t *x, uint64_t u) { - size_t i; +static inline void store64(uint8_t x[8], uint64_t u) { + unsigned int i = 0; - for (i = 0; i < 8; ++i) { - x[i] = (uint8_t)u; - u >>= 8; + for (i = 0; i < 8; i++) { + x[i] = u >> 8 * i; } } -/* Use implementation from the Keccak Code Package */ -extern void KeccakP1600times4_PermuteAll_24rounds(__m256i *s); -#define KeccakF1600_StatePermute4x KeccakP1600times4_PermuteAll_24rounds - -static void keccak_absorb4x(__m256i *s, +static void keccakx4_absorb(__m256i s[25], unsigned int r, - const uint8_t *m0, - const uint8_t *m1, - const uint8_t *m2, - const uint8_t *m3, - size_t mlen, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen, uint8_t p) { - size_t i; - uint8_t t0[200] = {0}; - uint8_t t1[200] = {0}; - uint8_t t2[200] = {0}; - uint8_t t3[200] = {0}; + size_t i = 0, pos = 0; + __m256i t, idx; - unsigned long long *ss = (unsigned long long *)s; + for (i = 0; i < 25; ++i) { + s[i] = _mm256_setzero_si256(); + } - while (mlen >= r) { + idx = _mm256_set_epi64x((long long)in3, (long long)in2, (long long)in1, (long long)in0); + while (inlen >= r) { for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(m0 + 8 * i); - ss[4 * i + 1] ^= load64(m1 + 8 * i); - ss[4 * i + 2] ^= load64(m2 + 8 * i); - ss[4 * i + 3] ^= load64(m3 + 8 
* i); + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + pos += 8; } KeccakF1600_StatePermute4x(s); - mlen -= r; - m0 += r; - m1 += r; - m2 += r; - m3 += r; + inlen -= r; } - for (i = 0; i < mlen; ++i) { - t0[i] = m0[i]; - t1[i] = m1[i]; - t2[i] = m2[i]; - t3[i] = m3[i]; + i = 0; + while (inlen >= 8) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + s[i] = _mm256_xor_si256(s[i], t); + + i++; + pos += 8; + inlen -= 8; } - t0[i] = p; - t1[i] = p; - t2[i] = p; - t3[i] = p; - - t0[r - 1] |= 128; - t1[r - 1] |= 128; - t2[r - 1] |= 128; - t3[r - 1] |= 128; - - for (i = 0; i < r / 8; ++i) { - ss[4 * i + 0] ^= load64(t0 + 8 * i); - ss[4 * i + 1] ^= load64(t1 + 8 * i); - ss[4 * i + 2] ^= load64(t2 + 8 * i); - ss[4 * i + 3] ^= load64(t3 + 8 * i); + if (inlen) { + t = _mm256_i64gather_epi64((long long *)pos, idx, 1); + idx = _mm256_set1_epi64x((long long)((1ULL << (8 * inlen)) - 1)); + t = _mm256_and_si256(t, idx); + s[i] = _mm256_xor_si256(s[i], t); } + + t = _mm256_set1_epi64x((uint64_t)p << 8 * inlen); + s[i] = _mm256_xor_si256(s[i], t); + t = _mm256_set1_epi64x((long long)(1ULL << 63)); + s[r / 8 - 1] = _mm256_xor_si256(s[r / 8 - 1], t); } -static void keccak_squeezeblocks4x(uint8_t *h0, - uint8_t *h1, - uint8_t *h2, - uint8_t *h3, +static void keccakx4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, size_t nblocks, - __m256i *s, - unsigned int r) { - unsigned long long *ss = (unsigned long long *)s; + unsigned int r, + __m256i s[25]) { + unsigned int i = 0; + uint64_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; while (nblocks > 0) { KeccakF1600_StatePermute4x(s); - for (size_t i = 0; i < (r >> 3); i++) { - store64(h0 + 8 * i, ss[4 * i + 0]); - store64(h1 + 8 * i, ss[4 * i + 1]); - store64(h2 + 8 * i, ss[4 * i + 2]); - store64(h3 + 8 * i, ss[4 * i + 3]); + for (i = 0; i < r / 8; ++i) { + f0 = _mm256_extract_epi64(s[i], 0); + f1 = _mm256_extract_epi64(s[i], 1); + f2 = _mm256_extract_epi64(s[i], 2); + 
f3 = _mm256_extract_epi64(s[i], 3); + store64(out0, f0); + store64(out1, f1); + store64(out2, f2); + store64(out3, f3); + + out0 += 8; + out1 += 8; + out2 += 8; + out3 += 8; } - h0 += r; - h1 += r; - h2 += r; - h3 += r; - nblocks--; - } -} -void PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 2]; - - for (size_t i = 0; i < KYBER_SYMBYTES; ++i) { - extseed[0][i] = seed[i]; - extseed[1][i] = seed[i]; - extseed[2][i] = seed[i]; - extseed[3][i] = seed[i]; - } - extseed[0][KYBER_SYMBYTES + 0] = (uint8_t)nonce0; - extseed[0][KYBER_SYMBYTES + 1] = (uint8_t)(nonce0 >> 8); - extseed[1][KYBER_SYMBYTES + 0] = (uint8_t)nonce1; - extseed[1][KYBER_SYMBYTES + 1] = (uint8_t)(nonce1 >> 8); - extseed[2][KYBER_SYMBYTES + 0] = (uint8_t)nonce2; - extseed[2][KYBER_SYMBYTES + 1] = (uint8_t)(nonce2 >> 8); - extseed[3][KYBER_SYMBYTES + 0] = (uint8_t)nonce3; - extseed[3][KYBER_SYMBYTES + 1] = (uint8_t)(nonce3 >> 8); - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - state->s[i] = _mm256_xor_si256(state->s[i], state->s[i]); + --nblocks; } +} - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(state->s, SHAKE128_RATE, extseed[0], extseed[1], extseed[2], extseed[3], KYBER_SYMBYTES + 2, 0x1F); +void PQCLEAN_KYBER768_AVX2_shake128x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE128_RATE, in0, in1, in2, in3, inlen, 0x1F); } void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, @@ -150,82 +113,78 @@ void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out2, uint8_t *out3, size_t nblocks, - keccak4x_state *state) { - keccak_squeezeblocks4x(out0, out1, out2, out3, nblocks, state->s, SHAKE128_RATE); + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, 
out1, out2, out3, nblocks, SHAKE128_RATE, + state->s); } -static void shake256x4(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, size_t outlen, - const uint8_t *in0, - const uint8_t *in1, - const uint8_t *in2, - const uint8_t *in3, size_t inlen) { - __m256i s[25]; - uint8_t t0[SHAKE256_RATE]; - uint8_t t1[SHAKE256_RATE]; - uint8_t t2[SHAKE256_RATE]; - uint8_t t3[SHAKE256_RATE]; - - /* zero state */ - for (size_t i = 0; i < 25; i++) { - s[i] = _mm256_xor_si256(s[i], s[i]); - } - - /* absorb 4 message of identical length in parallel */ - keccak_absorb4x(s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); - - /* Squeeze output */ - keccak_squeezeblocks4x(out0, out1, out2, out3, outlen / SHAKE256_RATE, s, SHAKE256_RATE); +void PQCLEAN_KYBER768_AVX2_shake256x4_absorb(keccakx4_state *state, + const uint8_t *in0, + const uint8_t *in1, + const uint8_t *in2, + const uint8_t *in3, + size_t inlen) { + keccakx4_absorb(state->s, SHAKE256_RATE, in0, in1, in2, in3, inlen, 0x1F); +} - out0 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out1 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out2 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; - out3 += (outlen / SHAKE256_RATE) * SHAKE256_RATE; +void PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(uint8_t *out0, + uint8_t *out1, + uint8_t *out2, + uint8_t *out3, + size_t nblocks, + keccakx4_state *state) { + keccakx4_squeezeblocks(out0, out1, out2, out3, nblocks, SHAKE256_RATE, + state->s); +} - if (outlen % SHAKE256_RATE) { - keccak_squeezeblocks4x(t0, t1, t2, t3, 1, s, SHAKE256_RATE); - for (size_t i = 0; i < outlen % SHAKE256_RATE; i++) { - out0[i] = t0[i]; - out1[i] = t1[i]; - out2[i] = t2[i]; - out3[i] = t3[i]; +void PQCLEAN_KYBER768_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE128_RATE; + uint8_t t[4][SHAKE128_RATE]; + 
keccakx4_state state; + + PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE128_RATE; + out1 += nblocks * SHAKE128_RATE; + out2 += nblocks * SHAKE128_RATE; + out3 += nblocks * SHAKE128_RATE; + outlen -= nblocks * SHAKE128_RATE; + + if (outlen) { + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; } } } -void PQCLEAN_KYBER768_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3) { - uint8_t extseed[4][KYBER_SYMBYTES + 1]; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - extseed[0][i] = key[i]; - extseed[1][i] = key[i]; - extseed[2][i] = key[i]; - extseed[3][i] = key[i]; +void PQCLEAN_KYBER768_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen) { + unsigned int i = 0; + size_t nblocks = outlen / SHAKE256_RATE; + uint8_t t[4][SHAKE256_RATE]; + keccakx4_state state; + + PQCLEAN_KYBER768_AVX2_shake256x4_absorb(&state, in0, in1, in2, in3, inlen); + PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(out0, out1, out2, out3, nblocks, &state); + + out0 += nblocks * SHAKE256_RATE; + out1 += nblocks * SHAKE256_RATE; + out2 += nblocks * SHAKE256_RATE; + out3 += nblocks * SHAKE256_RATE; + outlen -= nblocks * SHAKE256_RATE; + + if (outlen) { + PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(t[0], t[1], t[2], t[3], 1, &state); + for (i = 0; i < outlen; ++i) { + out0[i] = t[0][i]; + out1[i] = t[1][i]; + out2[i] = t[2][i]; + out3[i] = t[3][i]; + } } - extseed[0][KYBER_SYMBYTES] = nonce0; - extseed[1][KYBER_SYMBYTES] = nonce1; 
- extseed[2][KYBER_SYMBYTES] = nonce2; - extseed[3][KYBER_SYMBYTES] = nonce3; - - shake256x4(out0, - out1, - out2, - out3, - outlen, - extseed[0], - extseed[1], - extseed[2], - extseed[3], - KYBER_SYMBYTES + 1); } diff --git a/crypto_kem/kyber768/avx2/fips202x4.h b/crypto_kem/kyber768/avx2/fips202x4.h index ffc1d4b0..2f525dc8 100644 --- a/crypto_kem/kyber768/avx2/fips202x4.h +++ b/crypto_kem/kyber768/avx2/fips202x4.h @@ -7,31 +7,19 @@ typedef struct { __m256i s[25]; -} keccak4x_state; - -void PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(keccak4x_state *state, - const uint8_t *seed, - uint16_t nonce0, - uint16_t nonce1, - uint16_t nonce2, - uint16_t nonce3); - -void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t nblocks, - keccak4x_state *state); - -void PQCLEAN_KYBER768_AVX2_shake256x4_prf(uint8_t *out0, - uint8_t *out1, - uint8_t *out2, - uint8_t *out3, - size_t outlen, - const uint8_t *key, - uint8_t nonce0, - uint8_t nonce1, - uint8_t nonce2, - uint8_t nonce3); +} keccakx4_state; + +void PQCLEAN_KYBER768_AVX2_shake128x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, keccakx4_state *state); + +void PQCLEAN_KYBER768_AVX2_shake256x4_absorb(keccakx4_state *state, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t nblocks, + keccakx4_state *state); + +void PQCLEAN_KYBER768_AVX2_shake128x4(uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); + +void PQCLEAN_KYBER768_AVX2_shake256x4(uint8_t *out0, uint8_t *out1, 
uint8_t *out2, uint8_t *out3, size_t outlen, const uint8_t *in0, const uint8_t *in1, const uint8_t *in2, const uint8_t *in3, size_t inlen); #endif diff --git a/crypto_kem/kyber1024/avx2/fq.s b/crypto_kem/kyber768/avx2/fq.S similarity index 54% rename from crypto_kem/kyber1024/avx2/fq.s rename to crypto_kem/kyber768/avx2/fq.S index 8a49d883..fc038a73 100644 --- a/crypto_kem/kyber1024/avx2/fq.s +++ b/crypto_kem/kyber768/avx2/fq.S @@ -1,11 +1,8 @@ +#include "cdecl.inc" .include "fq.inc" -.global PQCLEAN_KYBER1024_AVX2_reduce_avx -PQCLEAN_KYBER1024_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xv(%rip),%ymm1 - +.text +reduce128_avx: #load vmovdqa (%rdi),%ymm2 vmovdqa 32(%rdi),%ymm3 @@ -16,14 +13,14 @@ vmovdqa 160(%rdi),%ymm7 vmovdqa 192(%rdi),%ymm8 vmovdqa 224(%rdi),%ymm9 -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 +red16 2,10 +red16 3,11 +red16 4,12 +red16 5,13 +red16 6,14 +red16 7,15 +red16 8,10 +red16 9,11 #store vmovdqa %ymm2,(%rdi) @@ -37,11 +34,17 @@ vmovdqa %ymm9,224(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_csubq_avx -PQCLEAN_KYBER1024_AVX2_csubq_avx: +.global cdecl(PQCLEAN_KYBER768_AVX2_reduce_avx) +cdecl(PQCLEAN_KYBER768_AVX2_reduce_avx): #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XV*2(%rsi),%ymm1 +call reduce128_avx +add $256,%rdi +call reduce128_avx +ret +csubq128_avx: #load vmovdqa (%rdi),%ymm1 vmovdqa 32(%rdi),%ymm2 @@ -52,14 +55,14 @@ vmovdqa 160(%rdi),%ymm6 vmovdqa 192(%rdi),%ymm7 vmovdqa 224(%rdi),%ymm8 -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 +csubq 1,9 +csubq 2,10 +csubq 3,11 +csubq 4,12 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,9 #store vmovdqa %ymm1,(%rdi) @@ -73,13 +76,16 @@ vmovdqa %ymm8,224(%rdi) ret -.global PQCLEAN_KYBER1024_AVX2_frommont_avx -PQCLEAN_KYBER1024_AVX2_frommont_avx: +.global 
cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx) +cdecl(PQCLEAN_KYBER768_AVX2_csubq_avx): #consts -vmovdqa PQCLEAN_KYBER1024_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER1024_AVX2_16xmontsqhi(%rip),%ymm2 +vmovdqa _16XQ*2(%rsi),%ymm0 +call csubq128_avx +add $256,%rdi +call csubq128_avx +ret +tomont128_avx: #load vmovdqa (%rdi),%ymm3 vmovdqa 32(%rdi),%ymm4 @@ -90,14 +96,14 @@ vmovdqa 160(%rdi),%ymm8 vmovdqa 192(%rdi),%ymm9 vmovdqa 224(%rdi),%ymm10 -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 -fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 +fqmulprecomp 1,2,3,11 +fqmulprecomp 1,2,4,12 +fqmulprecomp 1,2,5,13 +fqmulprecomp 1,2,6,14 +fqmulprecomp 1,2,7,15 +fqmulprecomp 1,2,8,11 +fqmulprecomp 1,2,9,12 +fqmulprecomp 1,2,10,13 #store vmovdqa %ymm3,(%rdi) @@ -110,3 +116,14 @@ vmovdqa %ymm9,192(%rdi) vmovdqa %ymm10,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER768_AVX2_tomont_avx) +cdecl(PQCLEAN_KYBER768_AVX2_tomont_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +vmovdqa _16XMONTSQLO*2(%rsi),%ymm1 +vmovdqa _16XMONTSQHI*2(%rsi),%ymm2 +call tomont128_avx +add $256,%rdi +call tomont128_avx +ret diff --git a/crypto_kem/kyber768/avx2/fq.inc b/crypto_kem/kyber768/avx2/fq.inc index 56e36a02..75df098a 100644 --- a/crypto_kem/kyber768/avx2/fq.inc +++ b/crypto_kem/kyber768/avx2/fq.inc @@ -1,24 +1,27 @@ -.macro red16 r x=12 +.macro red16 r,x=12 vpmulhw %ymm1,%ymm\r,%ymm\x vpsraw $10,%ymm\x,%ymm\x vpmullw %ymm0,%ymm\x,%ymm\x vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro csubq r x=12 +.macro csubq r,x=12 vpsubw %ymm0,%ymm\r,%ymm\r vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r +#vpcmpgtw %ymm0,%ymm\r,%ymm\x +#vpand %ymm0,%ymm\x,%ymm\x +#vpsubw %ymm\x,%ymm\r,%ymm\r .endm -.macro caddq r x=12 +.macro caddq r,x=12 vpsraw $15,%ymm\r,%ymm\x vpand %ymm0,%ymm\x,%ymm\x vpaddw %ymm\x,%ymm\r,%ymm\r .endm -.macro fqmulprecomp al,ah,b x=12 
+.macro fqmulprecomp al,ah,b,x=12 vpmullw %ymm\al,%ymm\b,%ymm\x vpmulhw %ymm\ah,%ymm\b,%ymm\b vpmulhw %ymm0,%ymm\x,%ymm\x diff --git a/crypto_kem/kyber768/avx2/fq.s b/crypto_kem/kyber768/avx2/fq.s deleted file mode 100644 index 32362886..00000000 --- a/crypto_kem/kyber768/avx2/fq.s +++ /dev/null @@ -1,112 +0,0 @@ -.include "fq.inc" - -.global PQCLEAN_KYBER768_AVX2_reduce_avx -PQCLEAN_KYBER768_AVX2_reduce_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1 - -#load -vmovdqa (%rdi),%ymm2 -vmovdqa 32(%rdi),%ymm3 -vmovdqa 64(%rdi),%ymm4 -vmovdqa 96(%rdi),%ymm5 -vmovdqa 128(%rdi),%ymm6 -vmovdqa 160(%rdi),%ymm7 -vmovdqa 192(%rdi),%ymm8 -vmovdqa 224(%rdi),%ymm9 - -red16 2 10 -red16 3 11 -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 10 -red16 9 11 - -#store -vmovdqa %ymm2,(%rdi) -vmovdqa %ymm3,32(%rdi) -vmovdqa %ymm4,64(%rdi) -vmovdqa %ymm5,96(%rdi) -vmovdqa %ymm6,128(%rdi) -vmovdqa %ymm7,160(%rdi) -vmovdqa %ymm8,192(%rdi) -vmovdqa %ymm9,224(%rdi) - -ret - -.global PQCLEAN_KYBER768_AVX2_csubq_avx -PQCLEAN_KYBER768_AVX2_csubq_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - -#load -vmovdqa (%rdi),%ymm1 -vmovdqa 32(%rdi),%ymm2 -vmovdqa 64(%rdi),%ymm3 -vmovdqa 96(%rdi),%ymm4 -vmovdqa 128(%rdi),%ymm5 -vmovdqa 160(%rdi),%ymm6 -vmovdqa 192(%rdi),%ymm7 -vmovdqa 224(%rdi),%ymm8 - -csubq 1 9 -csubq 2 10 -csubq 3 11 -csubq 4 12 -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 9 - -#store -vmovdqa %ymm1,(%rdi) -vmovdqa %ymm2,32(%rdi) -vmovdqa %ymm3,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm5,128(%rdi) -vmovdqa %ymm6,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm8,224(%rdi) - -ret - -.global PQCLEAN_KYBER768_AVX2_frommont_avx -PQCLEAN_KYBER768_AVX2_frommont_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER768_AVX2_16xmontsqlo(%rip),%ymm1 -vmovdqa PQCLEAN_KYBER768_AVX2_16xmontsqhi(%rip),%ymm2 - -#load -vmovdqa (%rdi),%ymm3 -vmovdqa 32(%rdi),%ymm4 -vmovdqa 
64(%rdi),%ymm5 -vmovdqa 96(%rdi),%ymm6 -vmovdqa 128(%rdi),%ymm7 -vmovdqa 160(%rdi),%ymm8 -vmovdqa 192(%rdi),%ymm9 -vmovdqa 224(%rdi),%ymm10 - -fqmulprecomp 1,2,3 11 -fqmulprecomp 1,2,4 12 -fqmulprecomp 1,2,5 13 -fqmulprecomp 1,2,6 14 -fqmulprecomp 1,2,7 15 -fqmulprecomp 1,2,8 11 -fqmulprecomp 1,2,9 12 -fqmulprecomp 1,2,10 13 - -#store -vmovdqa %ymm3,(%rdi) -vmovdqa %ymm4,32(%rdi) -vmovdqa %ymm5,64(%rdi) -vmovdqa %ymm6,96(%rdi) -vmovdqa %ymm7,128(%rdi) -vmovdqa %ymm8,160(%rdi) -vmovdqa %ymm9,192(%rdi) -vmovdqa %ymm10,224(%rdi) - -ret diff --git a/crypto_kem/kyber768/avx2/indcpa.c b/crypto_kem/kyber768/avx2/indcpa.c index 90567655..cbc8a6f8 100644 --- a/crypto_kem/kyber768/avx2/indcpa.c +++ b/crypto_kem/kyber768/avx2/indcpa.c @@ -1,26 +1,33 @@ +#include "align.h" #include "cbd.h" #include "indcpa.h" #include "ntt.h" +#include "params.h" #include "poly.h" #include "polyvec.h" #include "randombytes.h" #include "rejsample.h" #include "symmetric.h" +#include +#include /************************************************* * Name: pack_pk * * Description: Serialize the public key as concatenation of the -* compressed and serialized vector of polynomials pk +* serialized vector of polynomials pk * and the public seed used to generate the matrix A. 
* * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER768_AVX2_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -28,16 +35,19 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { /************************************************* * Name: unpack_pk * -* Description: De-serialize and decompress public key from a byte array; +* Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key polynomial vector +* - uint8_t *seed: pointer to output seed to generate matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER768_AVX2_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -48,9 +58,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output 
serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER768_AVX2_polyvec_tobytes(r, sk); } @@ -60,10 +70,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of polynomials +* (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER768_AVX2_polyvec_frombytes(sk, packedsk); } @@ -74,11 +86,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER768_AVX2_polyvec_compress(r, b); PQCLEAN_KYBER768_AVX2_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -89,22 +103,42 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * 
Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER768_AVX2_polyvec_decompress(b, c); PQCLEAN_KYBER768_AVX2_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } -static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - unsigned int ctr, pos; - uint16_t val; +/************************************************* +* Name: rej_uniform +* +* Description: Run rejection sampling on uniform random bytes to generate +* uniform random integers mod q +* +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes +* +* Returns number of sampled 16-bit integers (at most len) +**************************************************/ +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { @@ -116,53 +150,76 @@ static size_t rej_uniform_ref(int16_t *r, size_t len, const uint8_t *buf, size_t return ctr; } -#define gen_a(A,B) 
gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER768_AVX2_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER768_AVX2_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER768_AVX2_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T is generated **************************************************/ -#define GEN_MATRIX_MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr0, ctr1, ctr2, ctr3, bufbytes; - union { - uint8_t x[4][XOF_BLOCKBYTES * GEN_MATRIX_MAXNBLOCKS]; - __m256i _dummy; - } buf; - keccak4x_state state; - keccak_state state1x; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[32], int transposed) { + unsigned int ctr0 = 0, ctr1 = 0, ctr2 = 0, ctr3 = 0; + ALIGN32_ARRAY_2D(uint8_t, 4, (GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 31) / 32 * 32) buf; + __m256i f; + keccakx4_state state; + xof_state state1x; + + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); if (transposed) { - PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(&state, seed, 0, 256, 512, 1); + buf.arr[0][KYBER_SYMBYTES + 0] = 0; + 
buf.arr[0][KYBER_SYMBYTES + 1] = 0; + buf.arr[1][KYBER_SYMBYTES + 0] = 0; + buf.arr[1][KYBER_SYMBYTES + 1] = 1; + buf.arr[2][KYBER_SYMBYTES + 0] = 0; + buf.arr[2][KYBER_SYMBYTES + 1] = 2; + buf.arr[3][KYBER_SYMBYTES + 0] = 1; + buf.arr[3][KYBER_SYMBYTES + 1] = 0; } else { - PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(&state, seed, 0, 1, 2, 256); + buf.arr[0][KYBER_SYMBYTES + 0] = 0; + buf.arr[0][KYBER_SYMBYTES + 1] = 0; + buf.arr[1][KYBER_SYMBYTES + 0] = 1; + buf.arr[1][KYBER_SYMBYTES + 1] = 0; + buf.arr[2][KYBER_SYMBYTES + 0] = 2; + buf.arr[2][KYBER_SYMBYTES + 1] = 0; + buf.arr[3][KYBER_SYMBYTES + 0] = 0; + buf.arr[3][KYBER_SYMBYTES + 1] = 1; } - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state); - bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; + PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, + &state); - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[0].vec[0].coeffs, KYBER_N, buf.x[0], bufbytes); - ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[0].vec[1].coeffs, KYBER_N, buf.x[1], bufbytes); - ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[0].vec[2].coeffs, KYBER_N, buf.x[2], bufbytes); - ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[1].vec[0].coeffs, KYBER_N, buf.x[3], bufbytes); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[0].coeffs, buf.arr[0]); + ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[1].coeffs, buf.arr[1]); + ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[0].vec[2].coeffs, buf.arr[2]); + ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[0].coeffs, buf.arr[3]); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state); - bufbytes = XOF_BLOCKBYTES; - - ctr0 += 
rej_uniform_ref(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); - ctr1 += rej_uniform_ref(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes); - ctr2 += rej_uniform_ref(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes); - ctr3 += rej_uniform_ref(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + ctr0 += rej_uniform(a[0].vec[0].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], + XOF_BLOCKBYTES); + ctr1 += rej_uniform(a[0].vec[1].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], + XOF_BLOCKBYTES); + ctr2 += rej_uniform(a[0].vec[2].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], + XOF_BLOCKBYTES); + ctr3 += rej_uniform(a[1].vec[0].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], + XOF_BLOCKBYTES); } PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[0].vec[0]); @@ -170,28 +227,52 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[0].vec[2]); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[1].vec[0]); + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); + if (transposed) { - PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(&state, seed, 257, 513, 2, 258); + buf.arr[0][KYBER_SYMBYTES + 0] = 1; + buf.arr[0][KYBER_SYMBYTES + 1] = 1; + buf.arr[1][KYBER_SYMBYTES + 0] = 1; + buf.arr[1][KYBER_SYMBYTES + 1] = 2; + buf.arr[2][KYBER_SYMBYTES + 0] = 2; + buf.arr[2][KYBER_SYMBYTES + 1] = 0; + buf.arr[3][KYBER_SYMBYTES + 0] = 2; + buf.arr[3][KYBER_SYMBYTES + 1] = 1; } else { - PQCLEAN_KYBER768_AVX2_kyber_shake128x4_absorb(&state, seed, 257, 258, 512, 513); + buf.arr[0][KYBER_SYMBYTES + 0] = 1; + buf.arr[0][KYBER_SYMBYTES + 1] = 1; + buf.arr[1][KYBER_SYMBYTES + 0] = 2; + buf.arr[1][KYBER_SYMBYTES + 1] = 1; + 
buf.arr[2][KYBER_SYMBYTES + 0] = 0; + buf.arr[2][KYBER_SYMBYTES + 1] = 2; + buf.arr[3][KYBER_SYMBYTES + 0] = 1; + buf.arr[3][KYBER_SYMBYTES + 1] = 2; } - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], GEN_MATRIX_MAXNBLOCKS, &state); - bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; + PQCLEAN_KYBER768_AVX2_shake128x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], KYBER_SYMBYTES + 2); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], GEN_MATRIX_NBLOCKS, + &state); - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[1].vec[1].coeffs, KYBER_N, buf.x[0], bufbytes); - ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[1].vec[2].coeffs, KYBER_N, buf.x[1], bufbytes); - ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[2].vec[0].coeffs, KYBER_N, buf.x[2], bufbytes); - ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[2].vec[1].coeffs, KYBER_N, buf.x[3], bufbytes); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[1].coeffs, buf.arr[0]); + ctr1 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[1].vec[2].coeffs, buf.arr[1]); + ctr2 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[0].coeffs, buf.arr[2]); + ctr3 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[1].coeffs, buf.arr[3]); while (ctr0 < KYBER_N || ctr1 < KYBER_N || ctr2 < KYBER_N || ctr3 < KYBER_N) { - PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.x[0], buf.x[1], buf.x[2], buf.x[3], 1, &state); - bufbytes = XOF_BLOCKBYTES; - - ctr0 += rej_uniform_ref(a[1].vec[1].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); - ctr1 += rej_uniform_ref(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf.x[1], bufbytes); - ctr2 += rej_uniform_ref(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.x[2], bufbytes); - ctr3 += rej_uniform_ref(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.x[3], bufbytes); + PQCLEAN_KYBER768_AVX2_shake128x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + ctr0 += rej_uniform(a[1].vec[1].coeffs + ctr0, 
KYBER_N - ctr0, buf.arr[0], + XOF_BLOCKBYTES); + ctr1 += rej_uniform(a[1].vec[2].coeffs + ctr1, KYBER_N - ctr1, buf.arr[1], + XOF_BLOCKBYTES); + ctr2 += rej_uniform(a[2].vec[0].coeffs + ctr2, KYBER_N - ctr2, buf.arr[2], + XOF_BLOCKBYTES); + ctr3 += rej_uniform(a[2].vec[1].coeffs + ctr3, KYBER_N - ctr3, buf.arr[3], + XOF_BLOCKBYTES); } PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[1].vec[1]); @@ -199,56 +280,59 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[0]); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[1]); - PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(&state1x, seed, 2, 2); - - PQCLEAN_KYBER768_AVX2_kyber_shake128_squeezeblocks(buf.x[0], GEN_MATRIX_MAXNBLOCKS, &state1x); - bufbytes = GEN_MATRIX_MAXNBLOCKS * XOF_BLOCKBYTES; - - ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform(a[2].vec[2].coeffs, KYBER_N, buf.x[0], bufbytes); - + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + buf.arr[0][KYBER_SYMBYTES + 0] = 2; + buf.arr[0][KYBER_SYMBYTES + 1] = 2; + shake128_absorb(&state1x, buf.arr[0], KYBER_SYMBYTES + 2); + shake128_squeezeblocks(buf.arr[0], GEN_MATRIX_NBLOCKS, &state1x); + ctr0 = PQCLEAN_KYBER768_AVX2_rej_uniform_avx(a[2].vec[2].coeffs, buf.arr[0]); while (ctr0 < KYBER_N) { - PQCLEAN_KYBER768_AVX2_kyber_shake128_squeezeblocks(buf.x[0], 1, &state1x); - bufbytes = XOF_BLOCKBYTES; - - ctr0 += rej_uniform_ref(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf.x[0], bufbytes); + shake128_squeezeblocks(buf.arr[0], 1, &state1x); + ctr0 += rej_uniform(a[2].vec[2].coeffs + ctr0, KYBER_N - ctr0, buf.arr[0], + XOF_BLOCKBYTES); } - - xof_ctx_release(&state1x); + shake128_ctx_release(&state1x); PQCLEAN_KYBER768_AVX2_poly_nttunpack(&a[2].vec[2]); } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER768_AVX2_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme 
underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], skpv, e, pkpv; - uint8_t buf[2 * KYBER_SYMBYTES]; - const uint8_t *publicseed = buf; - const uint8_t *noiseseed = buf + KYBER_SYMBYTES; - uint8_t nonce = 0; +void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + const uint8_t *publicseed = buf.arr; + const uint8_t *noiseseed = buf.arr + KYBER_SYMBYTES; + polyvec a[KYBER_K], e, pkpv, skpv; - randombytes(buf, KYBER_SYMBYTES); - hash_g(buf, buf, KYBER_SYMBYTES); + randombytes(buf.arr, KYBER_SYMBYTES); + hash_g(buf.arr, buf.arr, KYBER_SYMBYTES); gen_a(a, publicseed); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, e.vec + 0, noiseseed, nonce + 0, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(e.vec + 1, e.vec + 2, pkpv.vec + 0, pkpv.vec + 1, noiseseed, nonce + 4, nonce + 5, 0, 0); + PQCLEAN_KYBER768_AVX2_poly_getnoise4x(skpv.vec + 0, skpv.vec + 1, skpv.vec + 2, e.vec + 0, noiseseed, + 0, 1, 2, 3); + PQCLEAN_KYBER768_AVX2_poly_getnoise4x(e.vec + 1, e.vec + 2, pkpv.vec + 0, pkpv.vec + 1, noiseseed, + 4, 5, 6, 7); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&skpv); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(pkpv.vec + i, a + i, &skpv); - 
PQCLEAN_KYBER768_AVX2_poly_frommont(pkpv.vec + i); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER768_AVX2_poly_tomont(&pkpv.vec[i]); } PQCLEAN_KYBER768_AVX2_polyvec_add(&pkpv, &pkpv, &e); @@ -259,44 +343,51 @@ void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER768_AVX2_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec at[KYBER_K], pkpv, sp, ep, bp; - poly k, v, epp; - uint8_t seed[KYBER_SYMBYTES]; - uint8_t nonce = 0; - - unpack_pk(&pkpv, seed, pk); +void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; + 
ALIGN32_ARRAY(uint8_t, KYBER_SYMBYTES) seed; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; + + unpack_pk(&pkpv, seed.arr, pk); PQCLEAN_KYBER768_AVX2_poly_frommsg(&k, m); - gen_at(at, seed); + gen_at(at, seed.arr); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, ep.vec + 0, coins, nonce + 0, nonce + 1, nonce + 2, nonce + 3); - PQCLEAN_KYBER768_AVX2_poly_getnoise4x(ep.vec + 1, ep.vec + 2, &epp, bp.vec + 0, coins, nonce + 4, nonce + 5, nonce + 6, 0); + PQCLEAN_KYBER768_AVX2_poly_getnoise4x(sp.vec + 0, sp.vec + 1, sp.vec + 2, ep.vec + 0, coins, + 0, 1, 2, 3); + PQCLEAN_KYBER768_AVX2_poly_getnoise4x(ep.vec + 1, ep.vec + 2, &epp, bp.vec + 0, coins, + 4, 5, 6, 7); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(bp.vec + i, at + i, &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER768_AVX2_polyvec_invntt(&bp); - PQCLEAN_KYBER768_AVX2_poly_invntt(&v); + PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(&v); PQCLEAN_KYBER768_AVX2_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER768_AVX2_poly_add(&v, &v, &epp); @@ -308,18 +399,21 @@ void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER768_AVX2_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER768_AVX2_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER768_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -327,8 +421,8 @@ void PQCLEAN_KYBER768_AVX2_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER768_AVX2_polyvec_ntt(&bp); - PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER768_AVX2_poly_invntt(&mp); + PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(&mp); PQCLEAN_KYBER768_AVX2_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER768_AVX2_poly_reduce(&mp); diff --git a/crypto_kem/kyber768/avx2/indcpa.h b/crypto_kem/kyber768/avx2/indcpa.h index 3a3d761a..144c999b 100644 --- a/crypto_kem/kyber768/avx2/indcpa.h +++ b/crypto_kem/kyber768/avx2/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER768_AVX2_INDCPA_H +#define PQCLEAN_KYBER768_AVX2_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER768_AVX2_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER768_AVX2_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER768_AVX2_indcpa_enc( - uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - 
const uint8_t *coins); +void PQCLEAN_KYBER768_AVX2_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER768_AVX2_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER768_AVX2_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER768_AVX2_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber768/avx2/invntt.S b/crypto_kem/kyber768/avx2/invntt.S new file mode 100644 index 00000000..e7344b38 --- /dev/null +++ b/crypto_kem/kyber768/avx2/invntt.S @@ -0,0 +1,225 @@ +#include "cdecl.inc" +.include "shuffle.inc" +.include "fq.inc" + +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=1,zl1=1,zh0=2,zh1=2 +#update & mul +vpsubw %ymm\rh0,%ymm\rl0,%ymm12 +vpsubw %ymm\rh1,%ymm\rl1,%ymm13 +vpsubw %ymm\rh2,%ymm\rl2,%ymm14 + +vpaddw %ymm\rh0,%ymm\rl0,%ymm\rl0 +vpaddw %ymm\rh1,%ymm\rl1,%ymm\rl1 +vpmullw %ymm\zl0,%ymm12,%ymm\rh0 + +vpaddw %ymm\rh2,%ymm\rl2,%ymm\rl2 +vpmullw %ymm\zl0,%ymm13,%ymm\rh1 +vpsubw %ymm\rh3,%ymm\rl3,%ymm15 + +vpaddw %ymm\rh3,%ymm\rl3,%ymm\rl3 +vpmullw %ymm\zl1,%ymm14,%ymm\rh2 +vpmullw %ymm\zl1,%ymm15,%ymm\rh3 + +vpmulhw %ymm\zh0,%ymm12,%ymm12 +vpmulhw %ymm\zh0,%ymm13,%ymm13 + +vpmulhw %ymm\zh1,%ymm14,%ymm14 +vpmulhw %ymm\zh1,%ymm15,%ymm15 + +#reduce +vpmulhw %ymm0,%ymm\rh0,%ymm\rh0 +vpmulhw %ymm0,%ymm\rh1,%ymm\rh1 +vpmulhw %ymm0,%ymm\rh2,%ymm\rh2 +vpmulhw %ymm0,%ymm\rh3,%ymm\rh3 +vpsubw %ymm\rh0,%ymm12,%ymm\rh0 +vpsubw %ymm\rh1,%ymm13,%ymm\rh1 +vpsubw %ymm\rh2,%ymm14,%ymm\rh2 +vpsubw %ymm\rh3,%ymm15,%ymm\rh3 +.endm + +.text +invntt_levels0t5_avx: +level0: +#zetas +vmovdqu (%rsi),%ymm15 +vmovdqu 64(%rsi),%ymm3 +vmovdqu 32(%rsi),%ymm1 +vmovdqu 96(%rsi),%ymm2 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 
64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + +butterfly 4,5,8,9,6,7,10,11,15,3,1,2 + +level1: +#zetas +vmovdqu 128(%rsi),%ymm3 +vmovdqu 160(%rsi),%ymm2 + +butterfly 4,5,6,7,8,9,10,11,3,3,2,2 + +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +level2: +#zetas +vmovdqu 192(%rsi),%ymm10 +vmovdqu 224(%rsi),%ymm2 + +#consts +vmovdqa _16XV*2(%rdx),%ymm1 + +butterfly 3,4,6,8,5,7,9,11,10,10,2,2 + +red16 3 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +level3: +#zetas +vmovdqu 256(%rsi),%ymm9 +vmovdqu 288(%rsi),%ymm2 + +butterfly 10,3,6,5,4,8,7,11,9,9,2,2 + +red16 10 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +level4: +#zetas +vmovdqu 320(%rsi),%ymm7 +vmovdqu 352(%rsi),%ymm2 + +butterfly 9,10,6,4,3,5,8,11,7,7,2,2 + +red16 9 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +level5: +#zetas +vpbroadcastd 384(%rsi),%ymm8 +vpbroadcastd 388(%rsi),%ymm2 + +butterfly 7,9,6,3,10,4,5,11,8,8,2,2 + +red16 7 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa %ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret + +invntt_level6_avx: +#zetas +vpbroadcastd (%rsi),%ymm1 +vpbroadcastd 4(%rsi),%ymm2 + +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 256(%rdi),%ymm8 +vmovdqa 288(%rdi),%ymm9 +vmovdqa 320(%rdi),%ymm10 +vmovdqa 352(%rdi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +#consts +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 + +#store +vmovdqa %ymm8,256(%rdi) +vmovdqa %ymm9,288(%rdi) +vmovdqa %ymm10,320(%rdi) +vmovdqa %ymm11,352(%rdi) + +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 + +#store +vmovdqa %ymm4,(%rdi) +vmovdqa 
%ymm5,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm7,96(%rdi) + +#load +vmovdqa 128(%rdi),%ymm4 +vmovdqa 160(%rdi),%ymm5 +vmovdqa 192(%rdi),%ymm6 +vmovdqa 224(%rdi),%ymm7 +vmovdqa 384(%rdi),%ymm8 +vmovdqa 416(%rdi),%ymm9 +vmovdqa 448(%rdi),%ymm10 +vmovdqa 480(%rdi),%ymm11 + +butterfly 4,5,6,7,8,9,10,11 + +#consts +vmovdqa _16XFLO*2(%rdx),%ymm12 +vmovdqa _16XFHI*2(%rdx),%ymm13 + +#store +vmovdqa %ymm8,384(%rdi) +vmovdqa %ymm9,416(%rdi) +vmovdqa %ymm10,448(%rdi) +vmovdqa %ymm11,480(%rdi) + +fqmulprecomp 12,13,4,8 +fqmulprecomp 12,13,5,9 +fqmulprecomp 12,13,6,10 +fqmulprecomp 12,13,7,11 + +#store +vmovdqa %ymm4,128(%rdi) +vmovdqa %ymm5,160(%rdi) +vmovdqa %ymm6,192(%rdi) +vmovdqa %ymm7,224(%rdi) + +ret + +.global cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx) +cdecl(PQCLEAN_KYBER768_AVX2_invntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_INV_EXP*2,%rsi +call invntt_levels0t5_avx +add $256,%rdi +add $392,%rsi +call invntt_levels0t5_avx +sub $256,%rdi +add $392,%rsi +call invntt_level6_avx +ret diff --git a/crypto_kem/kyber768/avx2/kem.c b/crypto_kem/kyber768/avx2/kem.c index 792ae55c..10b3a615 100644 --- a/crypto_kem/kyber768/avx2/kem.c +++ b/crypto_kem/kyber768/avx2/kem.c @@ -1,103 +1,127 @@ -#include "api.h" +#include "align.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include + -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER768_AVX2_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of 
CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; PQCLEAN_KYBER768_AVX2_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER768_AVX2_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* 
Will contain key, coins */ - uint8_t buf[2 * KYBER_SYMBYTES]; +int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; - randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + randombytes(buf.arr, KYBER_SYMBYTES); + /* Don't release system RNG output */ + hash_h(buf.arr, buf.arr, KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf.arr + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER768_AVX2_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER768_AVX2_indcpa_enc(ct, buf.arr, pk, kr.arr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER768_AVX2_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of 
CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. **************************************************/ -int PQCLEAN_KYBER768_AVX2_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - union { - uint8_t x[KYBER_CIPHERTEXTBYTES]; - __m256i __dummy; - } _cmp; - uint8_t *cmp = _cmp.x; - uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER768_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) buf; + /* Will contain key, coins */ + ALIGN32_ARRAY(uint8_t, 2 * KYBER_SYMBYTES) kr; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; - PQCLEAN_KYBER768_AVX2_indcpa_dec(buf, ct, sk); + PQCLEAN_KYBER768_AVX2_indcpa_dec(buf.arr, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf.arr[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } - hash_g(kr, buf, 2 * KYBER_SYMBYTES); + hash_g(kr.arr, buf.arr, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER768_AVX2_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER768_AVX2_indcpa_enc(cmp, 
buf.arr, pk, kr.arr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER768_AVX2_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr.arr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER768_AVX2_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER768_AVX2_cmov(kr.arr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr.arr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768/avx2/kem.h b/crypto_kem/kyber768/avx2/kem.h new file mode 100644 index 00000000..2ba9873a --- /dev/null +++ b/crypto_kem/kyber768/avx2/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER768_AVX2_KEM_H +#define PQCLEAN_KYBER768_AVX2_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER768_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER768_AVX2_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER768_AVX2_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber512-90s/avx2/ntt.s b/crypto_kem/kyber768/avx2/ntt.S similarity index 81% rename from crypto_kem/kyber512-90s/avx2/ntt.s rename to crypto_kem/kyber768/avx2/ntt.S index 9019cf27..eaf9355e 100644 --- a/crypto_kem/kyber512-90s/avx2/ntt.s +++ b/crypto_kem/kyber768/avx2/ntt.S @@ -1,7 +1,8 @@ +#include "cdecl.inc" .include "shuffle.inc" .include "fq.inc" -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 
vpmullw %ymm\zl0,%ymm\rh1,%ymm13 @@ -36,7 +37,7 @@ vpaddw %ymm15,%ymm\rl3,%ymm\rl3 # We break the dependency chains with the cost of slightly more additions. # But they can be run in parallel to the multiplications on execution port 5 # (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 +.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 #mul vpmullw %ymm\zl0,%ymm\rh0,%ymm12 vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x @@ -73,11 +74,8 @@ vpaddw %ymm15,%ymm\rh3,%ymm\rh3 vpsubw %ymm15,%ymm\rl3,%ymm\rl3 .endm -.global PQCLEAN_KYBER51290S_AVX2_ntt_level0_avx -PQCLEAN_KYBER51290S_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 - +.text +ntt_level0_avx: level0: #zetas vpbroadcastd (%rsi),%ymm15 @@ -107,11 +105,7 @@ vmovdqa %ymm11,352(%rdi) ret -.global PQCLEAN_KYBER51290S_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER51290S_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xq(%rip),%ymm0 - +ntt_levels1t6_avx: level1: #zetas vpbroadcastd (%rsi),%ymm15 @@ -127,7 +121,7 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -butterfly2 4,5,6,7,8,9,10,11 3 +butterfly2 4,5,6,7,8,9,10,11,3 level2: #zetas @@ -139,7 +133,7 @@ shuffle8 5,9,4,9 shuffle8 6,10,5,10 shuffle8 7,11,6,11 -butterfly2 3,8,4,9,5,10,6,11 7 +butterfly2 3,8,4,9,5,10,6,11,7 level3: #zetas @@ -151,7 +145,7 @@ shuffle4 8,10,3,10 shuffle4 4,6,8,6 shuffle4 9,11,4,11 -butterfly2 7,5,3,10,8,6,4,11 9 +butterfly2 7,5,3,10,8,6,4,11,9 level4: #zetas @@ -163,7 +157,7 @@ shuffle2 5,6,7,6 shuffle2 3,4,5,4 shuffle2 10,11,3,11 -butterfly2 9,8,7,6,5,4,3,11 10 +butterfly2 9,8,7,6,5,4,3,11,10 level5: #zetas @@ -175,7 +169,7 @@ shuffle1 8,4,9,4 shuffle1 7,3,8,3 shuffle1 6,11,7,11 -butterfly2 10,5,9,4,8,3,7,11 6 +butterfly2 10,5,9,4,8,3,7,11,6 level6: #zetas @@ -184,17 +178,17 @@ vmovdqu 328(%rsi),%ymm15 vmovdqu 296(%rsi),%ymm1 vmovdqu 360(%rsi),%ymm2 -butterfly2 
10,5,8,3,9,4,7,11 6,1,14,15,1,2 +butterfly2 10,5,8,3,9,4,7,11,6,1,14,15,1,2 -vmovdqa PQCLEAN_KYBER51290S_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 -red16 9 14 -red16 4 15 -red16 8 2 -red16 3 6 -red16 7 12 -red16 11 13 +vmovdqa _16XV*2(%rdx),%ymm1 +red16 10,12 +red16 5,13 +red16 9,14 +red16 4,15 +red16 8,2 +red16 3,6 +red16 7,12 +red16 11,13 #store vmovdqa %ymm10,(%rdi) @@ -207,3 +201,20 @@ vmovdqa %ymm7,192(%rdi) vmovdqa %ymm11,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx) +cdecl(PQCLEAN_KYBER768_AVX2_ntt_avx): +#consts +vmovdqa _16XQ*2(%rsi),%ymm0 +mov %rsi,%rdx +add $_ZETAS_EXP*2,%rsi +call ntt_level0_avx +add $128,%rdi +call ntt_level0_avx +sub $128,%rdi +add $8,%rsi +call ntt_levels1t6_avx +add $256,%rdi +add $392,%rsi +call ntt_levels1t6_avx +ret diff --git a/crypto_kem/kyber768/avx2/ntt.h b/crypto_kem/kyber768/avx2/ntt.h index aa454b7c..206f3a4e 100644 --- a/crypto_kem/kyber768/avx2/ntt.h +++ b/crypto_kem/kyber768/avx2/ntt.h @@ -2,19 +2,27 @@ #define NTT_H #include "consts.h" - +#include "params.h" #include -void PQCLEAN_KYBER768_AVX2_ntt_level0_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER768_AVX2_invntt_level6_avx(int16_t *r, const uint16_t *zetas); -void PQCLEAN_KYBER768_AVX2_nttpack_avx(int16_t *r); -void PQCLEAN_KYBER768_AVX2_nttunpack_avx(int16_t *r); -void PQCLEAN_KYBER768_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); -void PQCLEAN_KYBER768_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const uint16_t *zeta); - -void PQCLEAN_KYBER768_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a); -void PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a); + +void PQCLEAN_KYBER768_AVX2_ntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +void 
PQCLEAN_KYBER768_AVX2_invntt_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + + +void nttpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +void PQCLEAN_KYBER768_AVX2_nttunpack_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + + +void PQCLEAN_KYBER768_AVX2_basemul_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +void PQCLEAN_KYBER768_AVX2_basemul_acc_avx(int16_t *r, const int16_t *a, const int16_t *b, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + + +void PQCLEAN_KYBER768_AVX2_ntttobytes_avx(uint8_t *r, const int16_t *a, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +void PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(int16_t *r, const uint8_t *a, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768/avx2/ntt.s b/crypto_kem/kyber768/avx2/ntt.s deleted file mode 100644 index a4bd6dee..00000000 --- a/crypto_kem/kyber768/avx2/ntt.s +++ /dev/null @@ -1,209 +0,0 @@ -.include "shuffle.inc" -.include "fq.inc" - -.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\rh0 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh1 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\rh2 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh3 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 -vpsubw %ymm12,%ymm\rh0,%ymm12 -vpsubw %ymm13,%ymm\rh1,%ymm13 -vpsubw %ymm14,%ymm\rh2,%ymm14 -vpsubw %ymm15,%ymm\rh3,%ymm15 - -#update -vpsubw %ymm12,%ymm\rl0,%ymm\rh0 -vpaddw %ymm12,%ymm\rl0,%ymm\rl0 -vpsubw %ymm13,%ymm\rl1,%ymm\rh1 -vpaddw %ymm13,%ymm\rl1,%ymm\rl1 -vpsubw %ymm14,%ymm\rl2,%ymm\rh2 -vpaddw %ymm14,%ymm\rl2,%ymm\rl2 -vpsubw %ymm15,%ymm\rl3,%ymm\rh3 -vpaddw %ymm15,%ymm\rl3,%ymm\rl3 -.endm - -# We break the dependency chains with the cost of slightly more 
additions. -# But they can be run in parallel to the multiplications on execution port 5 -# (multiplications only go to ports 0 and 1) -.macro butterfly2 rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3 x=3,y=2,zl0=15,zl1=15,zh0=1,zh1=1 -#mul -vpmullw %ymm\zl0,%ymm\rh0,%ymm12 -vpmulhw %ymm\zh0,%ymm\rh0,%ymm\x -vpmullw %ymm\zl0,%ymm\rh1,%ymm13 -vpmulhw %ymm\zh0,%ymm\rh1,%ymm\rh0 -vpmullw %ymm\zl1,%ymm\rh2,%ymm14 -vpmulhw %ymm\zh1,%ymm\rh2,%ymm\y -vpmullw %ymm\zl1,%ymm\rh3,%ymm15 -vpmulhw %ymm\zh1,%ymm\rh3,%ymm\rh2 - -#reduce -vpmulhw %ymm0,%ymm12,%ymm12 -vpmulhw %ymm0,%ymm13,%ymm13 -vpmulhw %ymm0,%ymm14,%ymm14 -vpmulhw %ymm0,%ymm15,%ymm15 - -vpsubw %ymm\rh0,%ymm\rl1,%ymm\rh1 -vpaddw %ymm\rh0,%ymm\rl1,%ymm\rl1 -vpsubw %ymm\x,%ymm\rl0,%ymm\rh0 -vpaddw %ymm\x,%ymm\rl0,%ymm\rl0 -vpsubw %ymm\rh2,%ymm\rl3,%ymm\rh3 -vpaddw %ymm\rh2,%ymm\rl3,%ymm\rl3 -vpsubw %ymm\y,%ymm\rl2,%ymm\rh2 -vpaddw %ymm\y,%ymm\rl2,%ymm\rl2 - -#update -vpaddw %ymm12,%ymm\rh0,%ymm\rh0 -vpsubw %ymm12,%ymm\rl0,%ymm\rl0 -vpaddw %ymm13,%ymm\rh1,%ymm\rh1 -vpsubw %ymm13,%ymm\rl1,%ymm\rl1 -vpaddw %ymm14,%ymm\rh2,%ymm\rh2 -vpsubw %ymm14,%ymm\rl2,%ymm\rl2 -vpaddw %ymm15,%ymm\rh3,%ymm\rh3 -vpsubw %ymm15,%ymm\rl3,%ymm\rl3 -.endm - -.global PQCLEAN_KYBER768_AVX2_ntt_level0_avx -PQCLEAN_KYBER768_AVX2_ntt_level0_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - -level0: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 256(%rdi),%ymm8 -vmovdqa 288(%rdi),%ymm9 -vmovdqa 320(%rdi),%ymm10 -vmovdqa 352(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 - -#store -vmovdqa %ymm4,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm6,64(%rdi) -vmovdqa %ymm7,96(%rdi) -vmovdqa %ymm8,256(%rdi) -vmovdqa %ymm9,288(%rdi) -vmovdqa %ymm10,320(%rdi) -vmovdqa %ymm11,352(%rdi) - -ret - -.global PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx -PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx: -#consts -vmovdqa 
PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - -level1: -#zetas -vpbroadcastd (%rsi),%ymm15 -vpbroadcastd 4(%rsi),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -butterfly2 4,5,6,7,8,9,10,11 3 - -level2: -#zetas -vmovdqu 8(%rsi),%ymm15 -vmovdqu 40(%rsi),%ymm1 - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -butterfly2 3,8,4,9,5,10,6,11 7 - -level3: -#zetas -vmovdqu 72(%rsi),%ymm15 -vmovdqu 104(%rsi),%ymm1 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -butterfly2 7,5,3,10,8,6,4,11 9 - -level4: -#zetas -vmovdqu 136(%rsi),%ymm15 -vmovdqu 168(%rsi),%ymm1 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -butterfly2 9,8,7,6,5,4,3,11 10 - -level5: -#zetas -vmovdqu 200(%rsi),%ymm15 -vmovdqu 232(%rsi),%ymm1 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -butterfly2 10,5,9,4,8,3,7,11 6 - -level6: -#zetas -vmovdqu 264(%rsi),%ymm14 -vmovdqu 328(%rsi),%ymm15 -vmovdqu 296(%rsi),%ymm1 -vmovdqu 360(%rsi),%ymm2 - -butterfly2 10,5,8,3,9,4,7,11 6,1,14,15,1,2 - -vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1 -red16 10 12 -red16 5 13 -red16 9 14 -red16 4 15 -red16 8 2 -red16 3 6 -red16 7 12 -red16 11 13 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret diff --git a/crypto_kem/kyber768/avx2/params.h b/crypto_kem/kyber768/avx2/params.h index 3a1e0d10..1eb4c5f9 100644 --- a/crypto_kem/kyber768/avx2/params.h +++ b/crypto_kem/kyber768/avx2/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER768_AVX2_PARAMS_H +#define PQCLEAN_KYBER768_AVX2_PARAMS_H #define KYBER_N 256 
#define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 #define KYBER_POLYCOMPRESSEDBYTES 128 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber768/avx2/poly.c b/crypto_kem/kyber768/avx2/poly.c index b9e278e1..eca0cf27 100644 --- a/crypto_kem/kyber768/avx2/poly.c +++ b/crypto_kem/kyber768/avx2/poly.c @@ -1,131 +1,229 @@ +#include "align.h" #include "cbd.h" +#include "consts.h" #include "ntt.h" #include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include #include /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER768_AVX2_poly_compress * * Description: Compression and subsequent serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial 
**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *restrict a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t i, j, k = 0; PQCLEAN_KYBER768_AVX2_poly_csubq(a); - for (i = 0; i < KYBER_N; i += 8) { + for (i = 0; i < KYBER_N / 8; i++) { for (j = 0; j < 8; j++) { - t[j] = (uint8_t)(((((uint16_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15); + t[j] = ((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[k] = (uint8_t)(t[0] | (t[1] << 4)); - r[k + 1] = (uint8_t)(t[2] | (t[3] << 4)); - r[k + 2] = (uint8_t)(t[4] | (t[5] << 4)); - r[k + 3] = (uint8_t)(t[6] | (t[7] << 4)); - k += 4; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER768_AVX2_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER768_AVX2_poly_compress * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *r, const uint8_t *a) { - size_t i; - for (i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 1] = (int16_t)((((a[0] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 2] = (int16_t)((((a[1] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 3] = (int16_t)((((a[1] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 4] = (int16_t)((((a[2] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 5] = (int16_t)((((a[2] >> 4) * KYBER_Q) + 8) >> 4); - 
r->coeffs[i + 6] = (int16_t)((((a[3] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 7] = (int16_t)((((a[3] >> 4) * KYBER_Q) + 8) >> 4); - a += 4; +void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *restrict r, + const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 4) * KYBER_Q) + 8) >> 4; + a += 1; } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER768_AVX2_poly_tobytes * * Description: Serialization of a polynomial * * Arguments: - uint8_t *r: pointer to output byte array -* - const poly *a: pointer to input polynomial +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t *r, poly *a) { - PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r, a->coeffs); - PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r + 192, a->coeffs + 128); +void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + PQCLEAN_KYBER768_AVX2_ntttobytes_avx(r, a->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER768_AVX2_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER768_AVX2_poly_tobytes * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t *a) { - PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs, a); - PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs + 128, a + 192); +void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t 
a[KYBER_POLYBYTES]) { + PQCLEAN_KYBER768_AVX2_nttfrombytes_avx(r->coeffs, a, &PQCLEAN_KYBER768_AVX2_qdata); } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER768_AVX2_poly_frommsg +* +* Description: Convert 32-byte message to polynomial +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message +**************************************************/ +void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *restrict r, + const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + __m256i f, g0, g1, g2, g3, h0, h1, h2, h3; + const __m256i shift = _mm256_broadcastsi128_si256(_mm_set_epi32(0, 1, 2, 3)); + const __m256i idx = _mm256_broadcastsi128_si256(_mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + const __m256i hqs = _mm256_set1_epi16((KYBER_Q + 1) / 2); + +#define FROMMSG64(i) \ + g3 = _mm256_shuffle_epi32(f,0x55*(i)); \ + g3 = _mm256_sllv_epi32(g3,shift); \ + g3 = _mm256_shuffle_epi8(g3,idx); \ + g0 = _mm256_slli_epi16(g3,12); \ + g1 = _mm256_slli_epi16(g3,8); \ + g2 = _mm256_slli_epi16(g3,4); \ + g0 = _mm256_srai_epi16(g0,15); \ + g1 = _mm256_srai_epi16(g1,15); \ + g2 = _mm256_srai_epi16(g2,15); \ + g3 = _mm256_srai_epi16(g3,15); \ + g0 = _mm256_and_si256(g0,hqs); /* 19 18 17 16 3 2 1 0 */ \ + g1 = _mm256_and_si256(g1,hqs); /* 23 22 21 20 7 6 5 4 */ \ + g2 = _mm256_and_si256(g2,hqs); /* 27 26 25 24 11 10 9 8 */ \ + g3 = _mm256_and_si256(g3,hqs); /* 31 30 29 28 15 14 13 12 */ \ + h0 = _mm256_unpacklo_epi64(g0,g1); \ + h2 = _mm256_unpackhi_epi64(g0,g1); \ + h1 = _mm256_unpacklo_epi64(g2,g3); \ + h3 = _mm256_unpackhi_epi64(g2,g3); \ + g0 = _mm256_permute2x128_si256(h0,h1,0x20); \ + g2 = _mm256_permute2x128_si256(h0,h1,0x31); \ + g1 = _mm256_permute2x128_si256(h2,h3,0x20); \ + g3 = _mm256_permute2x128_si256(h2,h3,0x31); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+ 0],g0); \ + _mm256_store_si256((__m256i *)&r->coeffs[ 0+32*(i)+16],g1); \ + _mm256_store_si256((__m256i 
*)&r->coeffs[128+32*(i)+ 0],g2); \ + _mm256_store_si256((__m256i *)&r->coeffs[128+32*(i)+16],g3) + + f = _mm256_load_si256((__m256i *)msg); + FROMMSG64(0); + FROMMSG64(1); + FROMMSG64(2); + FROMMSG64(3); +} + +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *restrict a) { + unsigned int i = 0; + uint32_t small = 0; + __m256i f0, f1, g0, g1; + const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); + const __m256i hhqs = _mm256_set1_epi16((KYBER_Q - 5) / 4); + + for (i = 0; i < KYBER_N / 32; i++) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i]); + f1 = _mm256_load_si256((__m256i *)&a->coeffs[32 * i + 16]); + f0 = _mm256_sub_epi16(hqs, f0); + f1 = _mm256_sub_epi16(hqs, f1); + g0 = _mm256_srai_epi16(f0, 15); + g1 = _mm256_srai_epi16(f1, 15); + f0 = _mm256_xor_si256(f0, g0); + f1 = _mm256_xor_si256(f1, g1); + f0 = _mm256_sub_epi16(hhqs, f0); + f1 = _mm256_sub_epi16(hhqs, f1); + f0 = _mm256_packs_epi16(f0, f1); + small = _mm256_movemask_epi8(f0); + small = ~small; + msg[4 * i + 0] = small; + msg[4 * i + 1] = small >> 16; + msg[4 * i + 2] = small >> 8; + msg[4 * i + 3] = small >> 24; + } +} + +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce 
**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { - uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); - PQCLEAN_KYBER768_AVX2_cbd(r, buf); +void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { + ALIGN32_ARRAY(uint8_t, KYBER_ETA * KYBER_N / 4) buf; + prf(buf.arr, sizeof(buf.arr), seed, nonce); + PQCLEAN_KYBER768_AVX2_cbd(r, buf.arr); } -// FIXME void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, poly *r3, - const uint8_t *seed, + const uint8_t seed[32], uint8_t nonce0, uint8_t nonce1, uint8_t nonce2, uint8_t nonce3) { - uint8_t buf[4][SHAKE256_RATE]; - - PQCLEAN_KYBER768_AVX2_shake256x4_prf(buf[0], buf[1], buf[2], buf[3], SHAKE256_RATE, seed, nonce0, nonce1, nonce2, nonce3); - - PQCLEAN_KYBER768_AVX2_cbd(r0, buf[0]); - PQCLEAN_KYBER768_AVX2_cbd(r1, buf[1]); - PQCLEAN_KYBER768_AVX2_cbd(r2, buf[2]); - PQCLEAN_KYBER768_AVX2_cbd(r3, buf[3]); + ALIGN32_ARRAY_2D(uint8_t, 4, 160) buf; + __m256i f; + keccakx4_state state; + + f = _mm256_load_si256((__m256i *)seed); + _mm256_store_si256((__m256i *)buf.arr[0], f); + _mm256_store_si256((__m256i *)buf.arr[1], f); + _mm256_store_si256((__m256i *)buf.arr[2], f); + _mm256_store_si256((__m256i *)buf.arr[3], f); + + buf.arr[0][32] = nonce0; + buf.arr[1][32] = nonce1; + buf.arr[2][32] = nonce2; + buf.arr[3][32] = nonce3; + + PQCLEAN_KYBER768_AVX2_shake256x4_absorb(&state, buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 33); + PQCLEAN_KYBER768_AVX2_shake256x4_squeezeblocks(buf.arr[0], buf.arr[1], buf.arr[2], buf.arr[3], 1, &state); + + PQCLEAN_KYBER768_AVX2_cbd(r0, buf.arr[0]); + PQCLEAN_KYBER768_AVX2_cbd(r1, buf.arr[1]); + PQCLEAN_KYBER768_AVX2_cbd(r2, buf.arr[2]); + PQCLEAN_KYBER768_AVX2_cbd(r3, buf.arr[3]); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER768_AVX2_poly_ntt * * Description: 
Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -134,73 +232,78 @@ void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, * Arguments: - uint16_t *r: pointer to in/output polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r) { - PQCLEAN_KYBER768_AVX2_ntt_level0_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_exp); - PQCLEAN_KYBER768_AVX2_ntt_level0_avx(r->coeffs + 64, PQCLEAN_KYBER768_AVX2_zetas_exp); - PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_exp + 4); - PQCLEAN_KYBER768_AVX2_ntt_levels1t6_avx(r->coeffs + 128, PQCLEAN_KYBER768_AVX2_zetas_exp + 200); + PQCLEAN_KYBER768_AVX2_ntt_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER768_AVX2_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_invntt(poly *r) { - PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_inv_exp); - PQCLEAN_KYBER768_AVX2_invntt_levels0t5_avx(r->coeffs + 128, PQCLEAN_KYBER768_AVX2_zetas_inv_exp + 196); - PQCLEAN_KYBER768_AVX2_invntt_level6_avx(r->coeffs, PQCLEAN_KYBER768_AVX2_zetas_inv_exp + 392); +void PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(poly *r) { + PQCLEAN_KYBER768_AVX2_invntt_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } -// FIXME void PQCLEAN_KYBER768_AVX2_poly_nttunpack(poly *r) { - PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs); - PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs + 128); + PQCLEAN_KYBER768_AVX2_nttunpack_avx(r->coeffs, 
&PQCLEAN_KYBER768_AVX2_qdata); } -//XXX Add comment -void PQCLEAN_KYBER768_AVX2_poly_basemul(poly *r, const poly *a, const poly *b) { - PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs, - a->coeffs, - b->coeffs, - PQCLEAN_KYBER768_AVX2_zetas_exp + 152); - PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 64, - a->coeffs + 64, - b->coeffs + 64, - PQCLEAN_KYBER768_AVX2_zetas_exp + 184); - PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 128, - a->coeffs + 128, - b->coeffs + 128, - PQCLEAN_KYBER768_AVX2_zetas_exp + 348); - PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs + 192, - a->coeffs + 192, - b->coeffs + 192, - PQCLEAN_KYBER768_AVX2_zetas_exp + 380); +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery +* +* Description: Multiplication of two polynomials in NTT domain +* +* Arguments: - poly *r: pointer to output polynomial +* - const poly *a: pointer to first input polynomial +* - const poly *b: pointer to second input polynomial +**************************************************/ +void PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + PQCLEAN_KYBER768_AVX2_basemul_avx(r->coeffs, a->coeffs, b->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } -// FIXME -void PQCLEAN_KYBER768_AVX2_poly_frommont(poly *r) { - PQCLEAN_KYBER768_AVX2_frommont_avx(r->coeffs); - PQCLEAN_KYBER768_AVX2_frommont_avx(r->coeffs + 128); +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_tomont +* +* Description: Inplace conversion of all coefficients of a polynomial +* from normal domain to Montgomery domain +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ +void PQCLEAN_KYBER768_AVX2_poly_tomont(poly *r) { + PQCLEAN_KYBER768_AVX2_tomont_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_reduce +* +* Description: Applies Barrett 
reduction to all coefficients of a polynomial +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER768_AVX2_poly_reduce(poly *r) { - PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs); - PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs + 128); + PQCLEAN_KYBER768_AVX2_reduce_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_poly_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. For details of conditional subtraction +* of q see comments in reduce.c +* +* Arguments: - poly *r: pointer to input/output polynomial +**************************************************/ void PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r) { - PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs); - PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs + 128); + PQCLEAN_KYBER768_AVX2_csubq_avx(r->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER768_AVX2_poly_add * * Description: Add two polynomials * @@ -209,18 +312,19 @@ void PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_add_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_add_epi16(f0, f1); + _mm256_store_si256((__m256i 
*)&r->coeffs[i], f0); } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER768_AVX2_poly_sub * * Description: Subtract two polynomials * @@ -229,127 +333,13 @@ void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER768_AVX2_poly_sub(poly *r, const poly *a, const poly *b) { - __m256i vec0, vec1; - - for (size_t i = 0; i < KYBER_N; i += 16) { - vec0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); - vec1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); - vec0 = _mm256_sub_epi16(vec0, vec1); - _mm256_store_si256((__m256i *)&r->coeffs[i], vec0); - } -} - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - __m128i tmp; - __m256i a[4], d0, d1, d2, d3; - const __m256i shift = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - const __m256i zeros = _mm256_setzero_si256(); - const __m256i ones = _mm256_set1_epi32(1); - const __m256i hqs = _mm256_set1_epi32((KYBER_Q + 1) / 2); - - tmp = _mm_loadu_si128((__m128i *)msg); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, 
d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[32 * i + 16], d2); - } - - tmp = _mm_loadu_si128((__m128i *)&msg[16]); - for (size_t i = 0; i < 4; i++) { - a[i] = _mm256_broadcastd_epi32(tmp); - tmp = _mm_srli_si128(tmp, 4); - } - - for (size_t i = 0; i < 4; i++) { - d0 = _mm256_srlv_epi32(a[i], shift); - d1 = _mm256_srli_epi32(d0, 8); - d2 = _mm256_srli_epi32(d0, 16); - d3 = _mm256_srli_epi32(d0, 24); - - d0 = _mm256_and_si256(d0, ones); - d1 = _mm256_and_si256(d1, ones); - d2 = _mm256_and_si256(d2, ones); - d3 = _mm256_and_si256(d3, ones); - - d0 = _mm256_sub_epi32(zeros, d0); - d1 = _mm256_sub_epi32(zeros, d1); - d2 = _mm256_sub_epi32(zeros, d2); - d3 = _mm256_sub_epi32(zeros, d3); - - d0 = _mm256_and_si256(hqs, d0); - d1 = _mm256_and_si256(hqs, d1); - d2 = _mm256_and_si256(hqs, d2); - d3 = _mm256_and_si256(hqs, d3); - - d0 = _mm256_packus_epi32(d0, d1); - d2 = _mm256_packus_epi32(d2, d3); - d0 = _mm256_permute4x64_epi64(d0, 0xD8); - d2 = _mm256_permute4x64_epi64(d2, 0xD8); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 0], d0); - _mm256_store_si256((__m256i *)&r->coeffs[128 + 32 * i + 16], d2); - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint32_t small; - __m256i vec, tmp; - const __m256i hqs = _mm256_set1_epi16((KYBER_Q - 1) / 2); - const __m256i hhqs = 
_mm256_set1_epi16((KYBER_Q - 5) / 4); - - for (size_t i = 0; i < KYBER_N / 16; i++) { - vec = _mm256_load_si256((__m256i *)&a->coeffs[16 * i]); - vec = _mm256_sub_epi16(hqs, vec); - tmp = _mm256_srai_epi16(vec, 15); - vec = _mm256_xor_si256(vec, tmp); - vec = _mm256_sub_epi16(hhqs, vec); - small = (uint32_t)_mm256_movemask_epi8(vec); - small = _pext_u32(small, 0xAAAAAAAA); - small = ~small; - msg[2 * i + 0] = (uint8_t)small; - msg[2 * i + 1] = (uint8_t)(small >> 8); + unsigned int i = 0; + __m256i f0, f1; + + for (i = 0; i < KYBER_N; i += 16) { + f0 = _mm256_load_si256((__m256i *)&a->coeffs[i]); + f1 = _mm256_load_si256((__m256i *)&b->coeffs[i]); + f0 = _mm256_sub_epi16(f0, f1); + _mm256_store_si256((__m256i *)&r->coeffs[i], f0); } } diff --git a/crypto_kem/kyber768/avx2/poly.h b/crypto_kem/kyber768/avx2/poly.h index fb0589ae..6cce5615 100644 --- a/crypto_kem/kyber768/avx2/poly.h +++ b/crypto_kem/kyber768/avx2/poly.h @@ -1,8 +1,7 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER768_AVX2_POLY_H +#define PQCLEAN_KYBER768_AVX2_POLY_H #include "params.h" - #include #include @@ -11,20 +10,28 @@ * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... 
+ X^{n-1}*coeffs[n-1] */ typedef union { + __m256i dummy; int16_t coeffs[KYBER_N]; - __m256i _dummy; } poly; -void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER768_AVX2_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER768_AVX2_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER768_AVX2_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER768_AVX2_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); + -void PQCLEAN_KYBER768_AVX2_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER768_AVX2_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); +void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); -void PQCLEAN_KYBER768_AVX2_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, poly *r1, poly *r2, @@ -37,15 +44,23 @@ void PQCLEAN_KYBER768_AVX2_poly_getnoise4x(poly *r0, void PQCLEAN_KYBER768_AVX2_poly_ntt(poly *r); -void PQCLEAN_KYBER768_AVX2_poly_invntt(poly *r); + +void PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(poly *r); + void PQCLEAN_KYBER768_AVX2_poly_nttunpack(poly *r); -void PQCLEAN_KYBER768_AVX2_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER768_AVX2_poly_frommont(poly *r); + +void PQCLEAN_KYBER768_AVX2_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER768_AVX2_poly_tomont(poly *r); + void PQCLEAN_KYBER768_AVX2_poly_reduce(poly *r); + void 
PQCLEAN_KYBER768_AVX2_poly_csubq(poly *r); + void PQCLEAN_KYBER768_AVX2_poly_add(poly *r, const poly *a, const poly *b); + void PQCLEAN_KYBER768_AVX2_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber768/avx2/polyvec.c b/crypto_kem/kyber768/avx2/polyvec.c index 2ba0aebd..6b83eba7 100644 --- a/crypto_kem/kyber768/avx2/polyvec.c +++ b/crypto_kem/kyber768/avx2/polyvec.c @@ -1,157 +1,188 @@ +#include "params.h" +#include "consts.h" #include "ntt.h" #include "poly.h" #include "polyvec.h" - #include /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER768_AVX2_polyvec_compress * * Description: Compress and serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], + polyvec *restrict a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER768_AVX2_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) 
| (t[1] << 2); + r[2] = (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER768_AVX2_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER768_AVX2_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *restrict r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } 
/************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER768_AVX2_polyvec_tobytes * * Description: Serialize vector of polynomials * * Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER768_AVX2_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER768_AVX2_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array +* Arguments: - polyvec *r: pointer to output vector of polynomials -* - const polyvec *a: pointer to input vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER768_AVX2_polyvec_ntt * * Description: Apply forward NTT to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void 
PQCLEAN_KYBER768_AVX2_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_AVX2_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_AVX2_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { - PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs, - a->vec->coeffs, - b->vec->coeffs, - PQCLEAN_KYBER768_AVX2_zetas_exp + 152); - PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs + 64, - a->vec->coeffs + 64, - b->vec->coeffs + 64, - PQCLEAN_KYBER768_AVX2_zetas_exp + 184); - PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs + 128, - a->vec->coeffs + 128, - b->vec->coeffs + 128, - PQCLEAN_KYBER768_AVX2_zetas_exp + 348); - PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs + 192, - a->vec->coeffs + 192, - b->vec->coeffs + 192, - PQCLEAN_KYBER768_AVX2_zetas_exp + 380); +void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + PQCLEAN_KYBER768_AVX2_basemul_acc_avx(r->coeffs, a->vec->coeffs, b->vec->coeffs, &PQCLEAN_KYBER768_AVX2_qdata); } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_polyvec_reduce +* +* Description: Applies Barrett reduction to each coefficient +* of each element of a vector of polynomials +* for details of the Barrett reduction see comments in reduce.c +* +* Arguments: - polyvec *r: pointer to input/output vector of polynomials +**************************************************/ void PQCLEAN_KYBER768_AVX2_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_reduce(&r->vec[i]); } } -// FIXME +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_polyvec_csubq +* +* Description: Applies conditional subtraction of q to each coefficient +* of each element of a vector of polynomials +* for details of conditional 
subtraction of q see comments in +* reduce.c +* +* Arguments: - polyvec *r: pointer to input/output vector of polynomials +**************************************************/ void PQCLEAN_KYBER768_AVX2_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER768_AVX2_polyvec_add * * Description: Add vectors of polynomials * @@ -160,7 +191,8 @@ void PQCLEAN_KYBER768_AVX2_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER768_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_AVX2_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber768/avx2/polyvec.h b/crypto_kem/kyber768/avx2/polyvec.h index 8079d0b0..15b983d7 100644 --- a/crypto_kem/kyber768/avx2/polyvec.h +++ b/crypto_kem/kyber768/avx2/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER768_AVX2_POLYVEC_H +#define PQCLEAN_KYBER768_AVX2_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER768_AVX2_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER768_AVX2_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER768_AVX2_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], 
polyvec *a); + +void PQCLEAN_KYBER768_AVX2_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER768_AVX2_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER768_AVX2_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER768_AVX2_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER768_AVX2_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER768_AVX2_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER768_AVX2_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER768_AVX2_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber768/avx2/reduce.h b/crypto_kem/kyber768/avx2/reduce.h index 4da9eff8..1b9a96d9 100644 --- a/crypto_kem/kyber768/avx2/reduce.h +++ b/crypto_kem/kyber768/avx2/reduce.h @@ -3,8 +3,14 @@ #include -int16_t PQCLEAN_KYBER768_AVX2_reduce_avx(int16_t *r); -int16_t PQCLEAN_KYBER768_AVX2_csubq_avx(int16_t *r); -int16_t PQCLEAN_KYBER768_AVX2_frommont_avx(int16_t *r); +#include "consts.h" +#include "params.h" + + +int16_t PQCLEAN_KYBER768_AVX2_reduce_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +int16_t PQCLEAN_KYBER768_AVX2_csubq_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); + +int16_t PQCLEAN_KYBER768_AVX2_tomont_avx(int16_t *r, const qdata_t *PQCLEAN_KYBER768_AVX2_qdata); #endif diff --git a/crypto_kem/kyber768/avx2/rejsample.c b/crypto_kem/kyber768/avx2/rejsample.c index 1c22167f..dc112183 100644 --- a/crypto_kem/kyber768/avx2/rejsample.c +++ b/crypto_kem/kyber768/avx2/rejsample.c @@ -1,386 +1,360 @@ +#include "align.h" #include "consts.h" #include "params.h" #include "rejsample.h" - -#include #include - -static const uint8_t idx[256][8] = { - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0}, - { 2, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0}, - { 4, 0, 0, 0, 0, 0, 0, 0}, - { 0, 4, 0, 0, 0, 0, 0, 0}, - { 2, 4, 
0, 0, 0, 0, 0, 0}, - { 0, 2, 4, 0, 0, 0, 0, 0}, - { 6, 0, 0, 0, 0, 0, 0, 0}, - { 0, 6, 0, 0, 0, 0, 0, 0}, - { 2, 6, 0, 0, 0, 0, 0, 0}, - { 0, 2, 6, 0, 0, 0, 0, 0}, - { 4, 6, 0, 0, 0, 0, 0, 0}, - { 0, 4, 6, 0, 0, 0, 0, 0}, - { 2, 4, 6, 0, 0, 0, 0, 0}, - { 0, 2, 4, 6, 0, 0, 0, 0}, - { 8, 0, 0, 0, 0, 0, 0, 0}, - { 0, 8, 0, 0, 0, 0, 0, 0}, - { 2, 8, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 0, 0, 0, 0, 0}, - { 4, 8, 0, 0, 0, 0, 0, 0}, - { 0, 4, 8, 0, 0, 0, 0, 0}, - { 2, 4, 8, 0, 0, 0, 0, 0}, - { 0, 2, 4, 8, 0, 0, 0, 0}, - { 6, 8, 0, 0, 0, 0, 0, 0}, - { 0, 6, 8, 0, 0, 0, 0, 0}, - { 2, 6, 8, 0, 0, 0, 0, 0}, - { 0, 2, 6, 8, 0, 0, 0, 0}, - { 4, 6, 8, 0, 0, 0, 0, 0}, - { 0, 4, 6, 8, 0, 0, 0, 0}, - { 2, 4, 6, 8, 0, 0, 0, 0}, - { 0, 2, 4, 6, 8, 0, 0, 0}, - {10, 0, 0, 0, 0, 0, 0, 0}, - { 0, 10, 0, 0, 0, 0, 0, 0}, - { 2, 10, 0, 0, 0, 0, 0, 0}, - { 0, 2, 10, 0, 0, 0, 0, 0}, - { 4, 10, 0, 0, 0, 0, 0, 0}, - { 0, 4, 10, 0, 0, 0, 0, 0}, - { 2, 4, 10, 0, 0, 0, 0, 0}, - { 0, 2, 4, 10, 0, 0, 0, 0}, - { 6, 10, 0, 0, 0, 0, 0, 0}, - { 0, 6, 10, 0, 0, 0, 0, 0}, - { 2, 6, 10, 0, 0, 0, 0, 0}, - { 0, 2, 6, 10, 0, 0, 0, 0}, - { 4, 6, 10, 0, 0, 0, 0, 0}, - { 0, 4, 6, 10, 0, 0, 0, 0}, - { 2, 4, 6, 10, 0, 0, 0, 0}, - { 0, 2, 4, 6, 10, 0, 0, 0}, - { 8, 10, 0, 0, 0, 0, 0, 0}, - { 0, 8, 10, 0, 0, 0, 0, 0}, - { 2, 8, 10, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0}, - { 4, 8, 10, 0, 0, 0, 0, 0}, - { 0, 4, 8, 10, 0, 0, 0, 0}, - { 2, 4, 8, 10, 0, 0, 0, 0}, - { 0, 2, 4, 8, 10, 0, 0, 0}, - { 6, 8, 10, 0, 0, 0, 0, 0}, - { 0, 6, 8, 10, 0, 0, 0, 0}, - { 2, 6, 8, 10, 0, 0, 0, 0}, - { 0, 2, 6, 8, 10, 0, 0, 0}, - { 4, 6, 8, 10, 0, 0, 0, 0}, - { 0, 4, 6, 8, 10, 0, 0, 0}, - { 2, 4, 6, 8, 10, 0, 0, 0}, - { 0, 2, 4, 6, 8, 10, 0, 0}, - {12, 0, 0, 0, 0, 0, 0, 0}, - { 0, 12, 0, 0, 0, 0, 0, 0}, - { 2, 12, 0, 0, 0, 0, 0, 0}, - { 0, 2, 12, 0, 0, 0, 0, 0}, - { 4, 12, 0, 0, 0, 0, 0, 0}, - { 0, 4, 12, 0, 0, 0, 0, 0}, - { 2, 4, 12, 0, 0, 0, 0, 0}, - { 0, 2, 4, 12, 0, 0, 0, 0}, - { 6, 12, 0, 0, 0, 0, 0, 0}, - { 0, 6, 12, 0, 0, 0, 0, 
0}, - { 2, 6, 12, 0, 0, 0, 0, 0}, - { 0, 2, 6, 12, 0, 0, 0, 0}, - { 4, 6, 12, 0, 0, 0, 0, 0}, - { 0, 4, 6, 12, 0, 0, 0, 0}, - { 2, 4, 6, 12, 0, 0, 0, 0}, - { 0, 2, 4, 6, 12, 0, 0, 0}, - { 8, 12, 0, 0, 0, 0, 0, 0}, - { 0, 8, 12, 0, 0, 0, 0, 0}, - { 2, 8, 12, 0, 0, 0, 0, 0}, - { 0, 2, 8, 12, 0, 0, 0, 0}, - { 4, 8, 12, 0, 0, 0, 0, 0}, - { 0, 4, 8, 12, 0, 0, 0, 0}, - { 2, 4, 8, 12, 0, 0, 0, 0}, - { 0, 2, 4, 8, 12, 0, 0, 0}, - { 6, 8, 12, 0, 0, 0, 0, 0}, - { 0, 6, 8, 12, 0, 0, 0, 0}, - { 2, 6, 8, 12, 0, 0, 0, 0}, - { 0, 2, 6, 8, 12, 0, 0, 0}, - { 4, 6, 8, 12, 0, 0, 0, 0}, - { 0, 4, 6, 8, 12, 0, 0, 0}, - { 2, 4, 6, 8, 12, 0, 0, 0}, - { 0, 2, 4, 6, 8, 12, 0, 0}, - {10, 12, 0, 0, 0, 0, 0, 0}, - { 0, 10, 12, 0, 0, 0, 0, 0}, - { 2, 10, 12, 0, 0, 0, 0, 0}, - { 0, 2, 10, 12, 0, 0, 0, 0}, - { 4, 10, 12, 0, 0, 0, 0, 0}, - { 0, 4, 10, 12, 0, 0, 0, 0}, - { 2, 4, 10, 12, 0, 0, 0, 0}, - { 0, 2, 4, 10, 12, 0, 0, 0}, - { 6, 10, 12, 0, 0, 0, 0, 0}, - { 0, 6, 10, 12, 0, 0, 0, 0}, - { 2, 6, 10, 12, 0, 0, 0, 0}, - { 0, 2, 6, 10, 12, 0, 0, 0}, - { 4, 6, 10, 12, 0, 0, 0, 0}, - { 0, 4, 6, 10, 12, 0, 0, 0}, - { 2, 4, 6, 10, 12, 0, 0, 0}, - { 0, 2, 4, 6, 10, 12, 0, 0}, - { 8, 10, 12, 0, 0, 0, 0, 0}, - { 0, 8, 10, 12, 0, 0, 0, 0}, - { 2, 8, 10, 12, 0, 0, 0, 0}, - { 0, 2, 8, 10, 12, 0, 0, 0}, - { 4, 8, 10, 12, 0, 0, 0, 0}, - { 0, 4, 8, 10, 12, 0, 0, 0}, - { 2, 4, 8, 10, 12, 0, 0, 0}, - { 0, 2, 4, 8, 10, 12, 0, 0}, - { 6, 8, 10, 12, 0, 0, 0, 0}, - { 0, 6, 8, 10, 12, 0, 0, 0}, - { 2, 6, 8, 10, 12, 0, 0, 0}, - { 0, 2, 6, 8, 10, 12, 0, 0}, - { 4, 6, 8, 10, 12, 0, 0, 0}, - { 0, 4, 6, 8, 10, 12, 0, 0}, - { 2, 4, 6, 8, 10, 12, 0, 0}, - { 0, 2, 4, 6, 8, 10, 12, 0}, - {14, 0, 0, 0, 0, 0, 0, 0}, - { 0, 14, 0, 0, 0, 0, 0, 0}, - { 2, 14, 0, 0, 0, 0, 0, 0}, - { 0, 2, 14, 0, 0, 0, 0, 0}, - { 4, 14, 0, 0, 0, 0, 0, 0}, - { 0, 4, 14, 0, 0, 0, 0, 0}, - { 2, 4, 14, 0, 0, 0, 0, 0}, - { 0, 2, 4, 14, 0, 0, 0, 0}, - { 6, 14, 0, 0, 0, 0, 0, 0}, - { 0, 6, 14, 0, 0, 0, 0, 0}, - { 2, 6, 14, 0, 0, 0, 0, 0}, - { 0, 2, 6, 
14, 0, 0, 0, 0}, - { 4, 6, 14, 0, 0, 0, 0, 0}, - { 0, 4, 6, 14, 0, 0, 0, 0}, - { 2, 4, 6, 14, 0, 0, 0, 0}, - { 0, 2, 4, 6, 14, 0, 0, 0}, - { 8, 14, 0, 0, 0, 0, 0, 0}, - { 0, 8, 14, 0, 0, 0, 0, 0}, - { 2, 8, 14, 0, 0, 0, 0, 0}, - { 0, 2, 8, 14, 0, 0, 0, 0}, - { 4, 8, 14, 0, 0, 0, 0, 0}, - { 0, 4, 8, 14, 0, 0, 0, 0}, - { 2, 4, 8, 14, 0, 0, 0, 0}, - { 0, 2, 4, 8, 14, 0, 0, 0}, - { 6, 8, 14, 0, 0, 0, 0, 0}, - { 0, 6, 8, 14, 0, 0, 0, 0}, - { 2, 6, 8, 14, 0, 0, 0, 0}, - { 0, 2, 6, 8, 14, 0, 0, 0}, - { 4, 6, 8, 14, 0, 0, 0, 0}, - { 0, 4, 6, 8, 14, 0, 0, 0}, - { 2, 4, 6, 8, 14, 0, 0, 0}, - { 0, 2, 4, 6, 8, 14, 0, 0}, - {10, 14, 0, 0, 0, 0, 0, 0}, - { 0, 10, 14, 0, 0, 0, 0, 0}, - { 2, 10, 14, 0, 0, 0, 0, 0}, - { 0, 2, 10, 14, 0, 0, 0, 0}, - { 4, 10, 14, 0, 0, 0, 0, 0}, - { 0, 4, 10, 14, 0, 0, 0, 0}, - { 2, 4, 10, 14, 0, 0, 0, 0}, - { 0, 2, 4, 10, 14, 0, 0, 0}, - { 6, 10, 14, 0, 0, 0, 0, 0}, - { 0, 6, 10, 14, 0, 0, 0, 0}, - { 2, 6, 10, 14, 0, 0, 0, 0}, - { 0, 2, 6, 10, 14, 0, 0, 0}, - { 4, 6, 10, 14, 0, 0, 0, 0}, - { 0, 4, 6, 10, 14, 0, 0, 0}, - { 2, 4, 6, 10, 14, 0, 0, 0}, - { 0, 2, 4, 6, 10, 14, 0, 0}, - { 8, 10, 14, 0, 0, 0, 0, 0}, - { 0, 8, 10, 14, 0, 0, 0, 0}, - { 2, 8, 10, 14, 0, 0, 0, 0}, - { 0, 2, 8, 10, 14, 0, 0, 0}, - { 4, 8, 10, 14, 0, 0, 0, 0}, - { 0, 4, 8, 10, 14, 0, 0, 0}, - { 2, 4, 8, 10, 14, 0, 0, 0}, - { 0, 2, 4, 8, 10, 14, 0, 0}, - { 6, 8, 10, 14, 0, 0, 0, 0}, - { 0, 6, 8, 10, 14, 0, 0, 0}, - { 2, 6, 8, 10, 14, 0, 0, 0}, - { 0, 2, 6, 8, 10, 14, 0, 0}, - { 4, 6, 8, 10, 14, 0, 0, 0}, - { 0, 4, 6, 8, 10, 14, 0, 0}, - { 2, 4, 6, 8, 10, 14, 0, 0}, - { 0, 2, 4, 6, 8, 10, 14, 0}, - {12, 14, 0, 0, 0, 0, 0, 0}, - { 0, 12, 14, 0, 0, 0, 0, 0}, - { 2, 12, 14, 0, 0, 0, 0, 0}, - { 0, 2, 12, 14, 0, 0, 0, 0}, - { 4, 12, 14, 0, 0, 0, 0, 0}, - { 0, 4, 12, 14, 0, 0, 0, 0}, - { 2, 4, 12, 14, 0, 0, 0, 0}, - { 0, 2, 4, 12, 14, 0, 0, 0}, - { 6, 12, 14, 0, 0, 0, 0, 0}, - { 0, 6, 12, 14, 0, 0, 0, 0}, - { 2, 6, 12, 14, 0, 0, 0, 0}, - { 0, 2, 6, 12, 14, 0, 0, 0}, - { 4, 6, 12, 14, 0, 
0, 0, 0}, - { 0, 4, 6, 12, 14, 0, 0, 0}, - { 2, 4, 6, 12, 14, 0, 0, 0}, - { 0, 2, 4, 6, 12, 14, 0, 0}, - { 8, 12, 14, 0, 0, 0, 0, 0}, - { 0, 8, 12, 14, 0, 0, 0, 0}, - { 2, 8, 12, 14, 0, 0, 0, 0}, - { 0, 2, 8, 12, 14, 0, 0, 0}, - { 4, 8, 12, 14, 0, 0, 0, 0}, - { 0, 4, 8, 12, 14, 0, 0, 0}, - { 2, 4, 8, 12, 14, 0, 0, 0}, - { 0, 2, 4, 8, 12, 14, 0, 0}, - { 6, 8, 12, 14, 0, 0, 0, 0}, - { 0, 6, 8, 12, 14, 0, 0, 0}, - { 2, 6, 8, 12, 14, 0, 0, 0}, - { 0, 2, 6, 8, 12, 14, 0, 0}, - { 4, 6, 8, 12, 14, 0, 0, 0}, - { 0, 4, 6, 8, 12, 14, 0, 0}, - { 2, 4, 6, 8, 12, 14, 0, 0}, - { 0, 2, 4, 6, 8, 12, 14, 0}, - {10, 12, 14, 0, 0, 0, 0, 0}, - { 0, 10, 12, 14, 0, 0, 0, 0}, - { 2, 10, 12, 14, 0, 0, 0, 0}, - { 0, 2, 10, 12, 14, 0, 0, 0}, - { 4, 10, 12, 14, 0, 0, 0, 0}, - { 0, 4, 10, 12, 14, 0, 0, 0}, - { 2, 4, 10, 12, 14, 0, 0, 0}, - { 0, 2, 4, 10, 12, 14, 0, 0}, - { 6, 10, 12, 14, 0, 0, 0, 0}, - { 0, 6, 10, 12, 14, 0, 0, 0}, - { 2, 6, 10, 12, 14, 0, 0, 0}, - { 0, 2, 6, 10, 12, 14, 0, 0}, - { 4, 6, 10, 12, 14, 0, 0, 0}, - { 0, 4, 6, 10, 12, 14, 0, 0}, - { 2, 4, 6, 10, 12, 14, 0, 0}, - { 0, 2, 4, 6, 10, 12, 14, 0}, - { 8, 10, 12, 14, 0, 0, 0, 0}, - { 0, 8, 10, 12, 14, 0, 0, 0}, - { 2, 8, 10, 12, 14, 0, 0, 0}, - { 0, 2, 8, 10, 12, 14, 0, 0}, - { 4, 8, 10, 12, 14, 0, 0, 0}, - { 0, 4, 8, 10, 12, 14, 0, 0}, - { 2, 4, 8, 10, 12, 14, 0, 0}, - { 0, 2, 4, 8, 10, 12, 14, 0}, - { 6, 8, 10, 12, 14, 0, 0, 0}, - { 0, 6, 8, 10, 12, 14, 0, 0}, - { 2, 6, 8, 10, 12, 14, 0, 0}, - { 0, 2, 6, 8, 10, 12, 14, 0}, - { 4, 6, 8, 10, 12, 14, 0, 0}, - { 0, 4, 6, 8, 10, 12, 14, 0}, - { 2, 4, 6, 8, 10, 12, 14, 0}, - { 0, 2, 4, 6, 8, 10, 12, 14} +static const ALIGN32_ARRAY_2D(uint8_t, 256, 8) idx = {.arr = { + {-1, -1, -1, -1, -1, -1, -1, -1}, + { 0, -1, -1, -1, -1, -1, -1, -1}, + { 2, -1, -1, -1, -1, -1, -1, -1}, + { 0, 2, -1, -1, -1, -1, -1, -1}, + { 4, -1, -1, -1, -1, -1, -1, -1}, + { 0, 4, -1, -1, -1, -1, -1, -1}, + { 2, 4, -1, -1, -1, -1, -1, -1}, + { 0, 2, 4, -1, -1, -1, -1, -1}, + { 6, -1, -1, -1, -1, -1, -1, 
-1}, + { 0, 6, -1, -1, -1, -1, -1, -1}, + { 2, 6, -1, -1, -1, -1, -1, -1}, + { 0, 2, 6, -1, -1, -1, -1, -1}, + { 4, 6, -1, -1, -1, -1, -1, -1}, + { 0, 4, 6, -1, -1, -1, -1, -1}, + { 2, 4, 6, -1, -1, -1, -1, -1}, + { 0, 2, 4, 6, -1, -1, -1, -1}, + { 8, -1, -1, -1, -1, -1, -1, -1}, + { 0, 8, -1, -1, -1, -1, -1, -1}, + { 2, 8, -1, -1, -1, -1, -1, -1}, + { 0, 2, 8, -1, -1, -1, -1, -1}, + { 4, 8, -1, -1, -1, -1, -1, -1}, + { 0, 4, 8, -1, -1, -1, -1, -1}, + { 2, 4, 8, -1, -1, -1, -1, -1}, + { 0, 2, 4, 8, -1, -1, -1, -1}, + { 6, 8, -1, -1, -1, -1, -1, -1}, + { 0, 6, 8, -1, -1, -1, -1, -1}, + { 2, 6, 8, -1, -1, -1, -1, -1}, + { 0, 2, 6, 8, -1, -1, -1, -1}, + { 4, 6, 8, -1, -1, -1, -1, -1}, + { 0, 4, 6, 8, -1, -1, -1, -1}, + { 2, 4, 6, 8, -1, -1, -1, -1}, + { 0, 2, 4, 6, 8, -1, -1, -1}, + {10, -1, -1, -1, -1, -1, -1, -1}, + { 0, 10, -1, -1, -1, -1, -1, -1}, + { 2, 10, -1, -1, -1, -1, -1, -1}, + { 0, 2, 10, -1, -1, -1, -1, -1}, + { 4, 10, -1, -1, -1, -1, -1, -1}, + { 0, 4, 10, -1, -1, -1, -1, -1}, + { 2, 4, 10, -1, -1, -1, -1, -1}, + { 0, 2, 4, 10, -1, -1, -1, -1}, + { 6, 10, -1, -1, -1, -1, -1, -1}, + { 0, 6, 10, -1, -1, -1, -1, -1}, + { 2, 6, 10, -1, -1, -1, -1, -1}, + { 0, 2, 6, 10, -1, -1, -1, -1}, + { 4, 6, 10, -1, -1, -1, -1, -1}, + { 0, 4, 6, 10, -1, -1, -1, -1}, + { 2, 4, 6, 10, -1, -1, -1, -1}, + { 0, 2, 4, 6, 10, -1, -1, -1}, + { 8, 10, -1, -1, -1, -1, -1, -1}, + { 0, 8, 10, -1, -1, -1, -1, -1}, + { 2, 8, 10, -1, -1, -1, -1, -1}, + { 0, 2, 8, 10, -1, -1, -1, -1}, + { 4, 8, 10, -1, -1, -1, -1, -1}, + { 0, 4, 8, 10, -1, -1, -1, -1}, + { 2, 4, 8, 10, -1, -1, -1, -1}, + { 0, 2, 4, 8, 10, -1, -1, -1}, + { 6, 8, 10, -1, -1, -1, -1, -1}, + { 0, 6, 8, 10, -1, -1, -1, -1}, + { 2, 6, 8, 10, -1, -1, -1, -1}, + { 0, 2, 6, 8, 10, -1, -1, -1}, + { 4, 6, 8, 10, -1, -1, -1, -1}, + { 0, 4, 6, 8, 10, -1, -1, -1}, + { 2, 4, 6, 8, 10, -1, -1, -1}, + { 0, 2, 4, 6, 8, 10, -1, -1}, + {12, -1, -1, -1, -1, -1, -1, -1}, + { 0, 12, -1, -1, -1, -1, -1, -1}, + { 2, 12, -1, -1, -1, -1, -1, -1}, 
+ { 0, 2, 12, -1, -1, -1, -1, -1}, + { 4, 12, -1, -1, -1, -1, -1, -1}, + { 0, 4, 12, -1, -1, -1, -1, -1}, + { 2, 4, 12, -1, -1, -1, -1, -1}, + { 0, 2, 4, 12, -1, -1, -1, -1}, + { 6, 12, -1, -1, -1, -1, -1, -1}, + { 0, 6, 12, -1, -1, -1, -1, -1}, + { 2, 6, 12, -1, -1, -1, -1, -1}, + { 0, 2, 6, 12, -1, -1, -1, -1}, + { 4, 6, 12, -1, -1, -1, -1, -1}, + { 0, 4, 6, 12, -1, -1, -1, -1}, + { 2, 4, 6, 12, -1, -1, -1, -1}, + { 0, 2, 4, 6, 12, -1, -1, -1}, + { 8, 12, -1, -1, -1, -1, -1, -1}, + { 0, 8, 12, -1, -1, -1, -1, -1}, + { 2, 8, 12, -1, -1, -1, -1, -1}, + { 0, 2, 8, 12, -1, -1, -1, -1}, + { 4, 8, 12, -1, -1, -1, -1, -1}, + { 0, 4, 8, 12, -1, -1, -1, -1}, + { 2, 4, 8, 12, -1, -1, -1, -1}, + { 0, 2, 4, 8, 12, -1, -1, -1}, + { 6, 8, 12, -1, -1, -1, -1, -1}, + { 0, 6, 8, 12, -1, -1, -1, -1}, + { 2, 6, 8, 12, -1, -1, -1, -1}, + { 0, 2, 6, 8, 12, -1, -1, -1}, + { 4, 6, 8, 12, -1, -1, -1, -1}, + { 0, 4, 6, 8, 12, -1, -1, -1}, + { 2, 4, 6, 8, 12, -1, -1, -1}, + { 0, 2, 4, 6, 8, 12, -1, -1}, + {10, 12, -1, -1, -1, -1, -1, -1}, + { 0, 10, 12, -1, -1, -1, -1, -1}, + { 2, 10, 12, -1, -1, -1, -1, -1}, + { 0, 2, 10, 12, -1, -1, -1, -1}, + { 4, 10, 12, -1, -1, -1, -1, -1}, + { 0, 4, 10, 12, -1, -1, -1, -1}, + { 2, 4, 10, 12, -1, -1, -1, -1}, + { 0, 2, 4, 10, 12, -1, -1, -1}, + { 6, 10, 12, -1, -1, -1, -1, -1}, + { 0, 6, 10, 12, -1, -1, -1, -1}, + { 2, 6, 10, 12, -1, -1, -1, -1}, + { 0, 2, 6, 10, 12, -1, -1, -1}, + { 4, 6, 10, 12, -1, -1, -1, -1}, + { 0, 4, 6, 10, 12, -1, -1, -1}, + { 2, 4, 6, 10, 12, -1, -1, -1}, + { 0, 2, 4, 6, 10, 12, -1, -1}, + { 8, 10, 12, -1, -1, -1, -1, -1}, + { 0, 8, 10, 12, -1, -1, -1, -1}, + { 2, 8, 10, 12, -1, -1, -1, -1}, + { 0, 2, 8, 10, 12, -1, -1, -1}, + { 4, 8, 10, 12, -1, -1, -1, -1}, + { 0, 4, 8, 10, 12, -1, -1, -1}, + { 2, 4, 8, 10, 12, -1, -1, -1}, + { 0, 2, 4, 8, 10, 12, -1, -1}, + { 6, 8, 10, 12, -1, -1, -1, -1}, + { 0, 6, 8, 10, 12, -1, -1, -1}, + { 2, 6, 8, 10, 12, -1, -1, -1}, + { 0, 2, 6, 8, 10, 12, -1, -1}, + { 4, 6, 8, 10, 12, -1, -1, -1}, 
+ { 0, 4, 6, 8, 10, 12, -1, -1}, + { 2, 4, 6, 8, 10, 12, -1, -1}, + { 0, 2, 4, 6, 8, 10, 12, -1}, + {14, -1, -1, -1, -1, -1, -1, -1}, + { 0, 14, -1, -1, -1, -1, -1, -1}, + { 2, 14, -1, -1, -1, -1, -1, -1}, + { 0, 2, 14, -1, -1, -1, -1, -1}, + { 4, 14, -1, -1, -1, -1, -1, -1}, + { 0, 4, 14, -1, -1, -1, -1, -1}, + { 2, 4, 14, -1, -1, -1, -1, -1}, + { 0, 2, 4, 14, -1, -1, -1, -1}, + { 6, 14, -1, -1, -1, -1, -1, -1}, + { 0, 6, 14, -1, -1, -1, -1, -1}, + { 2, 6, 14, -1, -1, -1, -1, -1}, + { 0, 2, 6, 14, -1, -1, -1, -1}, + { 4, 6, 14, -1, -1, -1, -1, -1}, + { 0, 4, 6, 14, -1, -1, -1, -1}, + { 2, 4, 6, 14, -1, -1, -1, -1}, + { 0, 2, 4, 6, 14, -1, -1, -1}, + { 8, 14, -1, -1, -1, -1, -1, -1}, + { 0, 8, 14, -1, -1, -1, -1, -1}, + { 2, 8, 14, -1, -1, -1, -1, -1}, + { 0, 2, 8, 14, -1, -1, -1, -1}, + { 4, 8, 14, -1, -1, -1, -1, -1}, + { 0, 4, 8, 14, -1, -1, -1, -1}, + { 2, 4, 8, 14, -1, -1, -1, -1}, + { 0, 2, 4, 8, 14, -1, -1, -1}, + { 6, 8, 14, -1, -1, -1, -1, -1}, + { 0, 6, 8, 14, -1, -1, -1, -1}, + { 2, 6, 8, 14, -1, -1, -1, -1}, + { 0, 2, 6, 8, 14, -1, -1, -1}, + { 4, 6, 8, 14, -1, -1, -1, -1}, + { 0, 4, 6, 8, 14, -1, -1, -1}, + { 2, 4, 6, 8, 14, -1, -1, -1}, + { 0, 2, 4, 6, 8, 14, -1, -1}, + {10, 14, -1, -1, -1, -1, -1, -1}, + { 0, 10, 14, -1, -1, -1, -1, -1}, + { 2, 10, 14, -1, -1, -1, -1, -1}, + { 0, 2, 10, 14, -1, -1, -1, -1}, + { 4, 10, 14, -1, -1, -1, -1, -1}, + { 0, 4, 10, 14, -1, -1, -1, -1}, + { 2, 4, 10, 14, -1, -1, -1, -1}, + { 0, 2, 4, 10, 14, -1, -1, -1}, + { 6, 10, 14, -1, -1, -1, -1, -1}, + { 0, 6, 10, 14, -1, -1, -1, -1}, + { 2, 6, 10, 14, -1, -1, -1, -1}, + { 0, 2, 6, 10, 14, -1, -1, -1}, + { 4, 6, 10, 14, -1, -1, -1, -1}, + { 0, 4, 6, 10, 14, -1, -1, -1}, + { 2, 4, 6, 10, 14, -1, -1, -1}, + { 0, 2, 4, 6, 10, 14, -1, -1}, + { 8, 10, 14, -1, -1, -1, -1, -1}, + { 0, 8, 10, 14, -1, -1, -1, -1}, + { 2, 8, 10, 14, -1, -1, -1, -1}, + { 0, 2, 8, 10, 14, -1, -1, -1}, + { 4, 8, 10, 14, -1, -1, -1, -1}, + { 0, 4, 8, 10, 14, -1, -1, -1}, + { 2, 4, 8, 10, 14, -1, -1, 
-1}, + { 0, 2, 4, 8, 10, 14, -1, -1}, + { 6, 8, 10, 14, -1, -1, -1, -1}, + { 0, 6, 8, 10, 14, -1, -1, -1}, + { 2, 6, 8, 10, 14, -1, -1, -1}, + { 0, 2, 6, 8, 10, 14, -1, -1}, + { 4, 6, 8, 10, 14, -1, -1, -1}, + { 0, 4, 6, 8, 10, 14, -1, -1}, + { 2, 4, 6, 8, 10, 14, -1, -1}, + { 0, 2, 4, 6, 8, 10, 14, -1}, + {12, 14, -1, -1, -1, -1, -1, -1}, + { 0, 12, 14, -1, -1, -1, -1, -1}, + { 2, 12, 14, -1, -1, -1, -1, -1}, + { 0, 2, 12, 14, -1, -1, -1, -1}, + { 4, 12, 14, -1, -1, -1, -1, -1}, + { 0, 4, 12, 14, -1, -1, -1, -1}, + { 2, 4, 12, 14, -1, -1, -1, -1}, + { 0, 2, 4, 12, 14, -1, -1, -1}, + { 6, 12, 14, -1, -1, -1, -1, -1}, + { 0, 6, 12, 14, -1, -1, -1, -1}, + { 2, 6, 12, 14, -1, -1, -1, -1}, + { 0, 2, 6, 12, 14, -1, -1, -1}, + { 4, 6, 12, 14, -1, -1, -1, -1}, + { 0, 4, 6, 12, 14, -1, -1, -1}, + { 2, 4, 6, 12, 14, -1, -1, -1}, + { 0, 2, 4, 6, 12, 14, -1, -1}, + { 8, 12, 14, -1, -1, -1, -1, -1}, + { 0, 8, 12, 14, -1, -1, -1, -1}, + { 2, 8, 12, 14, -1, -1, -1, -1}, + { 0, 2, 8, 12, 14, -1, -1, -1}, + { 4, 8, 12, 14, -1, -1, -1, -1}, + { 0, 4, 8, 12, 14, -1, -1, -1}, + { 2, 4, 8, 12, 14, -1, -1, -1}, + { 0, 2, 4, 8, 12, 14, -1, -1}, + { 6, 8, 12, 14, -1, -1, -1, -1}, + { 0, 6, 8, 12, 14, -1, -1, -1}, + { 2, 6, 8, 12, 14, -1, -1, -1}, + { 0, 2, 6, 8, 12, 14, -1, -1}, + { 4, 6, 8, 12, 14, -1, -1, -1}, + { 0, 4, 6, 8, 12, 14, -1, -1}, + { 2, 4, 6, 8, 12, 14, -1, -1}, + { 0, 2, 4, 6, 8, 12, 14, -1}, + {10, 12, 14, -1, -1, -1, -1, -1}, + { 0, 10, 12, 14, -1, -1, -1, -1}, + { 2, 10, 12, 14, -1, -1, -1, -1}, + { 0, 2, 10, 12, 14, -1, -1, -1}, + { 4, 10, 12, 14, -1, -1, -1, -1}, + { 0, 4, 10, 12, 14, -1, -1, -1}, + { 2, 4, 10, 12, 14, -1, -1, -1}, + { 0, 2, 4, 10, 12, 14, -1, -1}, + { 6, 10, 12, 14, -1, -1, -1, -1}, + { 0, 6, 10, 12, 14, -1, -1, -1}, + { 2, 6, 10, 12, 14, -1, -1, -1}, + { 0, 2, 6, 10, 12, 14, -1, -1}, + { 4, 6, 10, 12, 14, -1, -1, -1}, + { 0, 4, 6, 10, 12, 14, -1, -1}, + { 2, 4, 6, 10, 12, 14, -1, -1}, + { 0, 2, 4, 6, 10, 12, 14, -1}, + { 8, 10, 12, 14, -1, -1, -1, 
-1}, + { 0, 8, 10, 12, 14, -1, -1, -1}, + { 2, 8, 10, 12, 14, -1, -1, -1}, + { 0, 2, 8, 10, 12, 14, -1, -1}, + { 4, 8, 10, 12, 14, -1, -1, -1}, + { 0, 4, 8, 10, 12, 14, -1, -1}, + { 2, 4, 8, 10, 12, 14, -1, -1}, + { 0, 2, 4, 8, 10, 12, 14, -1}, + { 6, 8, 10, 12, 14, -1, -1, -1}, + { 0, 6, 8, 10, 12, 14, -1, -1}, + { 2, 6, 8, 10, 12, 14, -1, -1}, + { 0, 2, 6, 8, 10, 12, 14, -1}, + { 4, 6, 8, 10, 12, 14, -1, -1}, + { 0, 4, 6, 8, 10, 12, 14, -1}, + { 2, 4, 6, 8, 10, 12, 14, -1}, + { 0, 2, 4, 6, 8, 10, 12, 14} + } }; -#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) -#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) +#define _mm256_cmpge_epu16(a, b) _mm256_cmpeq_epi16(_mm256_max_epu16(a, b), a) +#define _mm_cmpge_epu16(a, b) _mm_cmpeq_epi16(_mm_max_epu16(a, b), a) -size_t PQCLEAN_KYBER768_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen) { - size_t ctr, pos; - uint16_t val; - uint32_t good0, good1, good2; - const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); // -1 to use cheaper >= instead of > comparison +#define REJ_UNIFORM_BUFLEN 672 +unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *restrict r, + const uint8_t *restrict buf) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; + uint32_t good = 0; + const __m256i bound = _mm256_set1_epi16((int16_t)(19 * KYBER_Q - 1)); const __m256i ones = _mm256_set1_epi8(1); - const __m256i kyberq = _mm256_load_si256(&PQCLEAN_KYBER768_AVX2_16xq.as_vec); - const __m256i v = _mm256_load_si256(&PQCLEAN_KYBER768_AVX2_16xv.as_vec); - __m256i d0, d1, d2, tmp0, tmp1, tmp2, pi0, pi1, pi2; - __m128i d, tmp, pilo, pihi; + const __m256i kyberq = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER768_AVX2_qdata.as_arr[_16XQ]); + const __m256i v = _mm256_load_si256((__m256i *)&PQCLEAN_KYBER768_AVX2_qdata.as_arr[_16XV]); + __m256i f0, f1, g0, g1, g2, g3; + __m128i f, t, pilo, pihi; - ctr = pos = 0; - while (ctr + 48 <= len && pos + 96 <= 
buflen) { - d0 = _mm256_loadu_si256((__m256i *)&buf[pos + 0]); - d1 = _mm256_loadu_si256((__m256i *)&buf[pos + 32]); - d2 = _mm256_loadu_si256((__m256i *)&buf[pos + 64]); + ctr = 0; + for (pos = 0; pos < 2 * KYBER_N; pos += 64) { + f0 = _mm256_load_si256((__m256i *)&buf[pos + 0]); + f1 = _mm256_load_si256((__m256i *)&buf[pos + 32]); - tmp0 = _mm256_cmpge_epu16(bound, d0); - tmp1 = _mm256_cmpge_epu16(bound, d1); - tmp2 = _mm256_cmpge_epu16(bound, d2); - good0 = (uint32_t)_mm256_movemask_epi8(tmp0); - good1 = (uint32_t)_mm256_movemask_epi8(tmp1); - good2 = (uint32_t)_mm256_movemask_epi8(tmp2); - good0 = _pext_u32(good0, 0x55555555); - good1 = _pext_u32(good1, 0x55555555); - good2 = _pext_u32(good2, 0x55555555); + g0 = _mm256_cmpge_epu16(bound, f0); + g1 = _mm256_cmpge_epu16(bound, f1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good0 >> 8) & 0xFF]); - pi0 = _mm256_castsi128_si256(pilo); - pi0 = _mm256_inserti128_si256(pi0, pihi, 1); + g0 = _mm256_packs_epi16(g0, g1); + good = _mm256_movemask_epi8(g0); - pilo = _mm_loadl_epi64((__m128i *)&idx[good1 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good1 >> 8) & 0xFF]); - pi1 = _mm256_castsi128_si256(pilo); - pi1 = _mm256_inserti128_si256(pi1, pihi, 1); + g0 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 0) & 0xFF])); + g1 = _mm256_castsi128_si256(_mm_loadl_epi64((__m128i *)&idx.arr[(good >> 8) & 0xFF])); + g0 = _mm256_inserti128_si256(g0, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 16) & 0xFF]), 1); + g1 = _mm256_inserti128_si256(g1, _mm_loadl_epi64((__m128i *)&idx.arr[(good >> 24) & 0xFF]), 1); - pilo = _mm_loadl_epi64((__m128i *)&idx[good2 & 0xFF]); - pihi = _mm_loadl_epi64((__m128i *)&idx[(good2 >> 8) & 0xFF]); - pi2 = _mm256_castsi128_si256(pilo); - pi2 = _mm256_inserti128_si256(pi2, pihi, 1); + //g0 = _mm256_cvtepu8_epi64(_mm_loadl_epi64((__m128i *)&good)); + //g1 = _mm256_i64gather_epi64((long long *)idx, g0, 8); - tmp0 = 
_mm256_add_epi8(pi0, ones); - tmp1 = _mm256_add_epi8(pi1, ones); - tmp2 = _mm256_add_epi8(pi2, ones); - pi0 = _mm256_unpacklo_epi8(pi0, tmp0); - pi1 = _mm256_unpacklo_epi8(pi1, tmp1); - pi2 = _mm256_unpacklo_epi8(pi2, tmp2); + /* Barrett reduction of (still unsigned) values */ + g2 = _mm256_mulhi_epu16(f0, v); + g3 = _mm256_mulhi_epu16(f1, v); + g2 = _mm256_srli_epi16(g2, 10); + g3 = _mm256_srli_epi16(g3, 10); + g2 = _mm256_mullo_epi16(g2, kyberq); + g3 = _mm256_mullo_epi16(g3, kyberq); + f0 = _mm256_sub_epi16(f0, g2); + f1 = _mm256_sub_epi16(f1, g3); - d0 = _mm256_shuffle_epi8(d0, pi0); - d1 = _mm256_shuffle_epi8(d1, pi1); - d2 = _mm256_shuffle_epi8(d2, pi2); + g2 = _mm256_add_epi8(g0, ones); + g3 = _mm256_add_epi8(g1, ones); + g0 = _mm256_unpacklo_epi8(g0, g2); + g1 = _mm256_unpacklo_epi8(g1, g3); - /* Barrett reduction of (still unsigned) d values */ - tmp0 = _mm256_mulhi_epu16(d0, v); - tmp1 = _mm256_mulhi_epu16(d1, v); - tmp2 = _mm256_mulhi_epu16(d2, v); - tmp0 = _mm256_srli_epi16(tmp0, 10); - tmp1 = _mm256_srli_epi16(tmp1, 10); - tmp2 = _mm256_srli_epi16(tmp2, 10); - tmp0 = _mm256_mullo_epi16(tmp0, kyberq); - tmp1 = _mm256_mullo_epi16(tmp1, kyberq); - tmp2 = _mm256_mullo_epi16(tmp2, kyberq); - d0 = _mm256_sub_epi16(d0, tmp0); - d1 = _mm256_sub_epi16(d1, tmp1); - d2 = _mm256_sub_epi16(d2, tmp2); + f0 = _mm256_shuffle_epi8(f0, g0); + f1 = _mm256_shuffle_epi8(f1, g1); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d0)); - ctr += (unsigned int)_mm_popcnt_u32(good0 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d0, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good0 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d1)); - ctr += (unsigned int)_mm_popcnt_u32(good1 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d1, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good1 >> 8) & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(d2)); - ctr += (unsigned 
int)_mm_popcnt_u32(good2 & 0xFF); - _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(d2, 1)); - ctr += (unsigned int)_mm_popcnt_u32((good2 >> 8) & 0xFF); - pos += 96; + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f0)); + ctr += _mm_popcnt_u32((good >> 0) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f0, 1)); + ctr += _mm_popcnt_u32((good >> 16) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_castsi256_si128(f1)); + ctr += _mm_popcnt_u32((good >> 8) & 0xFF); + _mm_storeu_si128((__m128i *)&r[ctr], _mm256_extracti128_si256(f1, 1)); + ctr += _mm_popcnt_u32((good >> 24) & 0xFF); } - while (ctr + 8 <= len && pos + 16 <= buflen) { - d = _mm_loadu_si128((__m128i *)&buf[pos]); - tmp = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), d); - good0 = (uint32_t)_mm_movemask_epi8(tmp); - good0 = _pext_u32(good0, 0x55555555); - pilo = _mm_loadl_epi64((__m128i *)&idx[good0]); + while (ctr <= KYBER_N - 8 && pos <= REJ_UNIFORM_BUFLEN - 16) { + f = _mm_load_si128((__m128i *)&buf[pos]); + t = _mm_cmpge_epu16(_mm256_castsi256_si128(bound), f); + good = _mm_movemask_epi8(t); + good = _pext_u32(good, 0x5555); + pilo = _mm_loadl_epi64((__m128i *)&idx.arr[good]); pihi = _mm_add_epi8(pilo, _mm256_castsi256_si128(ones)); pilo = _mm_unpacklo_epi8(pilo, pihi); - d = _mm_shuffle_epi8(d, pilo); /* Barrett reduction */ - tmp = _mm_mulhi_epu16(d, _mm256_castsi256_si128(v)); - tmp = _mm_srli_epi16(tmp, 10); - tmp = _mm_mullo_epi16(tmp, _mm256_castsi256_si128(kyberq)); - d = _mm_sub_epi16(d, tmp); + t = _mm_mulhi_epu16(f, _mm256_castsi256_si128(v)); + t = _mm_srli_epi16(t, 10); + t = _mm_mullo_epi16(t, _mm256_castsi256_si128(kyberq)); + f = _mm_sub_epi16(f, t); - _mm_storeu_si128((__m128i *)&r[ctr], d); - ctr += (unsigned int)_mm_popcnt_u32(good0); + f = _mm_shuffle_epi8(f, pilo); + _mm_storeu_si128((__m128i *)&r[ctr], f); + ctr += _mm_popcnt_u32(good); pos += 16; } - while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | 
((uint16_t)buf[pos + 1] << 8)); + while (ctr < KYBER_N && pos <= REJ_UNIFORM_BUFLEN - 2) { + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { val -= ((int32_t)val * 20159 >> 26) * KYBER_Q; - r[ctr++] = (int16_t)val; + r[ctr++] = val; } } diff --git a/crypto_kem/kyber768/avx2/rejsample.h b/crypto_kem/kyber768/avx2/rejsample.h index b2369f1e..b92b5035 100644 --- a/crypto_kem/kyber768/avx2/rejsample.h +++ b/crypto_kem/kyber768/avx2/rejsample.h @@ -1,12 +1,11 @@ #ifndef REJSAMPLE_H #define REJSAMPLE_H -#include +#include "params.h" #include -size_t PQCLEAN_KYBER768_AVX2_rej_uniform(int16_t *r, - size_t len, - const uint8_t *buf, - size_t buflen); + +unsigned int PQCLEAN_KYBER768_AVX2_rej_uniform_avx(int16_t *r, + const unsigned char *buf); #endif diff --git a/crypto_kem/kyber512/avx2/shuffle.s b/crypto_kem/kyber768/avx2/shuffle.S similarity index 66% rename from crypto_kem/kyber512/avx2/shuffle.s rename to crypto_kem/kyber768/avx2/shuffle.S index d3f09835..34d3b980 100644 --- a/crypto_kem/kyber512/avx2/shuffle.s +++ b/crypto_kem/kyber768/avx2/shuffle.S @@ -1,12 +1,9 @@ +#include "cdecl.inc" .include "fq.inc" .include "shuffle.inc" -.global PQCLEAN_KYBER512_AVX2_nttunpack_avx -PQCLEAN_KYBER512_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER512_AVX2_16xv(%rip),%ymm1 - +/* +nttpack_avx: #load vmovdqa (%rdi),%ymm4 vmovdqa 32(%rdi),%ymm5 @@ -17,18 +14,51 @@ vmovdqa 160(%rdi),%ymm9 vmovdqa 192(%rdi),%ymm10 vmovdqa 224(%rdi),%ymm11 -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 +shuffle1 4,5,3,5 +shuffle1 6,7,4,7 +shuffle1 8,9,6,9 +shuffle1 10,11,8,11 + +shuffle2 3,4,10,4 +shuffle2 6,8,3,8 +shuffle2 5,7,6,7 +shuffle2 9,11,5,11 + +shuffle4 10,3,9,3 +shuffle4 6,5,10,5 +shuffle4 4,8,6,8 +shuffle4 7,11,4,11 + +shuffle8 9,10,7,10 +shuffle8 6,4,9,4 +shuffle8 3,5,6,5 +shuffle8 8,11,3,11 + +#store +vmovdqa %ymm7,(%rdi) +vmovdqa 
%ymm9,32(%rdi) +vmovdqa %ymm6,64(%rdi) +vmovdqa %ymm3,96(%rdi) +vmovdqa %ymm10,128(%rdi) +vmovdqa %ymm4,160(%rdi) +vmovdqa %ymm5,192(%rdi) +vmovdqa %ymm11,224(%rdi) + +ret */ +.text +nttunpack128_avx: +#load +vmovdqa (%rdi),%ymm4 +vmovdqa 32(%rdi),%ymm5 +vmovdqa 64(%rdi),%ymm6 +vmovdqa 96(%rdi),%ymm7 +vmovdqa 128(%rdi),%ymm8 +vmovdqa 160(%rdi),%ymm9 +vmovdqa 192(%rdi),%ymm10 +vmovdqa 224(%rdi),%ymm11 + shuffle8 4,8,3,8 shuffle8 5,9,4,9 shuffle8 6,10,5,10 @@ -61,11 +91,14 @@ vmovdqa %ymm11,224(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_ntttobytes_avx -PQCLEAN_KYBER512_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xq(%rip),%ymm0 +.global cdecl(PQCLEAN_KYBER768_AVX2_nttunpack_avx) +cdecl(PQCLEAN_KYBER768_AVX2_nttunpack_avx): +call nttunpack128_avx +add $256,%rdi +call nttunpack128_avx +ret +ntttobytes128_avx: #load vmovdqa (%rsi),%ymm5 vmovdqa 32(%rsi),%ymm6 @@ -77,14 +110,14 @@ vmovdqa 192(%rsi),%ymm11 vmovdqa 224(%rsi),%ymm12 #csubq -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 +csubq 5,13 +csubq 6,14 +csubq 7,15 +csubq 8,1 +csubq 9,13 +csubq 10,14 +csubq 11,15 +csubq 12,1 #bitpack vpsllw $12,%ymm6,%ymm4 @@ -135,11 +168,17 @@ vmovdqu %ymm9,160(%rdi) ret -.global PQCLEAN_KYBER512_AVX2_nttfrombytes_avx -PQCLEAN_KYBER512_AVX2_nttfrombytes_avx: +.global cdecl(PQCLEAN_KYBER768_AVX2_ntttobytes_avx) +cdecl(PQCLEAN_KYBER768_AVX2_ntttobytes_avx): #consts -vmovdqa PQCLEAN_KYBER512_AVX2_16xmask(%rip),%ymm0 +vmovdqa _16XQ*2(%rdx),%ymm0 +call ntttobytes128_avx +add $256,%rsi +add $192,%rdi +call ntttobytes128_avx +ret +nttfrombytes128_avx: #load vmovdqu (%rsi),%ymm4 vmovdqu 32(%rsi),%ymm5 @@ -204,3 +243,13 @@ vmovdqa %ymm15,192(%rdi) vmovdqa %ymm1,224(%rdi) ret + +.global cdecl(PQCLEAN_KYBER768_AVX2_nttfrombytes_avx) +cdecl(PQCLEAN_KYBER768_AVX2_nttfrombytes_avx): +#consts +vmovdqa _16XMASK*2(%rdx),%ymm0 +call nttfrombytes128_avx +add $256,%rdi +add $192,%rsi +call nttfrombytes128_avx +ret diff --git 
a/crypto_kem/kyber768/avx2/shuffle.inc b/crypto_kem/kyber768/avx2/shuffle.inc index df352030..d4b092bc 100644 --- a/crypto_kem/kyber768/avx2/shuffle.inc +++ b/crypto_kem/kyber768/avx2/shuffle.inc @@ -9,6 +9,8 @@ vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3 .endm .macro shuffle2 r0,r1,r2,r3 +#vshufps $0x20,%ymm\r1,%ymm\r0,%ymm\r2 +#vshufps $0x31,%ymm\r1,%ymm\r0,%ymm\r3 vpsllq $32,%ymm\r1,%ymm12 vpsrlq $32,%ymm\r0,%ymm13 vpblendd $0xAA,%ymm12,%ymm\r0,%ymm\r2 diff --git a/crypto_kem/kyber768/avx2/shuffle.s b/crypto_kem/kyber768/avx2/shuffle.s deleted file mode 100644 index 6a8edecb..00000000 --- a/crypto_kem/kyber768/avx2/shuffle.s +++ /dev/null @@ -1,206 +0,0 @@ -.include "fq.inc" -.include "shuffle.inc" - -.global PQCLEAN_KYBER768_AVX2_nttunpack_avx -PQCLEAN_KYBER768_AVX2_nttunpack_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 -vmovdqa PQCLEAN_KYBER768_AVX2_16xv(%rip),%ymm1 - -#load -vmovdqa (%rdi),%ymm4 -vmovdqa 32(%rdi),%ymm5 -vmovdqa 64(%rdi),%ymm6 -vmovdqa 96(%rdi),%ymm7 -vmovdqa 128(%rdi),%ymm8 -vmovdqa 160(%rdi),%ymm9 -vmovdqa 192(%rdi),%ymm10 -vmovdqa 224(%rdi),%ymm11 - -/* -#reduce -red16 4 12 -red16 5 13 -red16 6 14 -red16 7 15 -red16 8 12 -red16 9 13 -red16 10 14 -red16 11 15 -*/ - -shuffle8 4,8,3,8 -shuffle8 5,9,4,9 -shuffle8 6,10,5,10 -shuffle8 7,11,6,11 - -shuffle4 3,5,7,5 -shuffle4 8,10,3,10 -shuffle4 4,6,8,6 -shuffle4 9,11,4,11 - -shuffle2 7,8,9,8 -shuffle2 5,6,7,6 -shuffle2 3,4,5,4 -shuffle2 10,11,3,11 - -shuffle1 9,5,10,5 -shuffle1 8,4,9,4 -shuffle1 7,3,8,3 -shuffle1 6,11,7,11 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm5,32(%rdi) -vmovdqa %ymm9,64(%rdi) -vmovdqa %ymm4,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm3,160(%rdi) -vmovdqa %ymm7,192(%rdi) -vmovdqa %ymm11,224(%rdi) - -ret - -.global PQCLEAN_KYBER768_AVX2_ntttobytes_avx -PQCLEAN_KYBER768_AVX2_ntttobytes_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xq(%rip),%ymm0 - -#load -vmovdqa (%rsi),%ymm5 -vmovdqa 32(%rsi),%ymm6 -vmovdqa 64(%rsi),%ymm7 -vmovdqa 96(%rsi),%ymm8 -vmovdqa 
128(%rsi),%ymm9 -vmovdqa 160(%rsi),%ymm10 -vmovdqa 192(%rsi),%ymm11 -vmovdqa 224(%rsi),%ymm12 - -#csubq -csubq 5 13 -csubq 6 14 -csubq 7 15 -csubq 8 1 -csubq 9 13 -csubq 10 14 -csubq 11 15 -csubq 12 1 - -#bitpack -vpsllw $12,%ymm6,%ymm4 -vpor %ymm4,%ymm5,%ymm4 - -vpsrlw $4,%ymm6,%ymm5 -vpsllw $8,%ymm7,%ymm6 -vpor %ymm5,%ymm6,%ymm5 - -vpsrlw $8,%ymm7,%ymm6 -vpsllw $4,%ymm8,%ymm7 -vpor %ymm6,%ymm7,%ymm6 - -vpsllw $12,%ymm10,%ymm7 -vpor %ymm7,%ymm9,%ymm7 - -vpsrlw $4,%ymm10,%ymm8 -vpsllw $8,%ymm11,%ymm9 -vpor %ymm8,%ymm9,%ymm8 - -vpsrlw $8,%ymm11,%ymm9 -vpsllw $4,%ymm12,%ymm10 -vpor %ymm9,%ymm10,%ymm9 - -shuffle1 4,5,3,5 -shuffle1 6,7,4,7 -shuffle1 8,9,6,9 - -shuffle2 3,4,8,4 -shuffle2 6,5,3,5 -shuffle2 7,9,6,9 - -shuffle4 8,3,7,3 -shuffle4 6,4,8,4 -shuffle4 5,9,6,9 - -shuffle8 7,8,5,8 -shuffle8 6,3,7,3 -shuffle8 4,9,6,9 - -#store -vmovdqu %ymm5,(%rdi) -vmovdqu %ymm7,32(%rdi) -vmovdqu %ymm6,64(%rdi) -vmovdqu %ymm8,96(%rdi) -vmovdqu %ymm3,128(%rdi) -vmovdqu %ymm9,160(%rdi) - -ret - -.global PQCLEAN_KYBER768_AVX2_nttfrombytes_avx -PQCLEAN_KYBER768_AVX2_nttfrombytes_avx: -#consts -vmovdqa PQCLEAN_KYBER768_AVX2_16xmask(%rip),%ymm0 - -#load -vmovdqu (%rsi),%ymm4 -vmovdqu 32(%rsi),%ymm5 -vmovdqu 64(%rsi),%ymm6 -vmovdqu 96(%rsi),%ymm7 -vmovdqu 128(%rsi),%ymm8 -vmovdqu 160(%rsi),%ymm9 - -shuffle8 4,7,3,7 -shuffle8 5,8,4,8 -shuffle8 6,9,5,9 - -shuffle4 3,8,6,8 -shuffle4 7,5,3,5 -shuffle4 4,9,7,9 - -shuffle2 6,5,4,5 -shuffle2 8,7,6,7 -shuffle2 3,9,8,9 - -shuffle1 4,7,10,7 -shuffle1 5,8,4,8 -shuffle1 6,9,5,9 - -#bitunpack -vpsrlw $12,%ymm10,%ymm11 -vpsllw $4,%ymm7,%ymm12 -vpor %ymm11,%ymm12,%ymm11 -vpand %ymm0,%ymm10,%ymm10 -vpand %ymm0,%ymm11,%ymm11 - -vpsrlw $8,%ymm7,%ymm12 -vpsllw $8,%ymm4,%ymm13 -vpor %ymm12,%ymm13,%ymm12 -vpand %ymm0,%ymm12,%ymm12 - -vpsrlw $4,%ymm4,%ymm13 -vpand %ymm0,%ymm13,%ymm13 - -vpsrlw $12,%ymm8,%ymm14 -vpsllw $4,%ymm5,%ymm15 -vpor %ymm14,%ymm15,%ymm14 -vpand %ymm0,%ymm8,%ymm8 -vpand %ymm0,%ymm14,%ymm14 - -vpsrlw $8,%ymm5,%ymm15 -vpsllw 
$8,%ymm9,%ymm1 -vpor %ymm15,%ymm1,%ymm15 -vpand %ymm0,%ymm15,%ymm15 - -vpsrlw $4,%ymm9,%ymm1 -vpand %ymm0,%ymm1,%ymm1 - -#store -vmovdqa %ymm10,(%rdi) -vmovdqa %ymm11,32(%rdi) -vmovdqa %ymm12,64(%rdi) -vmovdqa %ymm13,96(%rdi) -vmovdqa %ymm8,128(%rdi) -vmovdqa %ymm14,160(%rdi) -vmovdqa %ymm15,192(%rdi) -vmovdqa %ymm1,224(%rdi) - -ret diff --git a/crypto_kem/kyber768/avx2/symmetric-fips202.c b/crypto_kem/kyber768/avx2/symmetric-fips202.c deleted file mode 100644 index 0b7a8e4a..00000000 --- a/crypto_kem/kyber768/avx2/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. - -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. Can be called multiple times to keep squeezing, -* i.e., is incremental. 
-* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER768_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER768_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber768/avx2/symmetric-shake.c b/crypto_kem/kyber768/avx2/symmetric-shake.c new file mode 100644 index 00000000..bc980363 --- /dev/null +++ b/crypto_kem/kyber768/avx2/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - keccak_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t i additional byte of input +* - uint8_t j additional byte of input +**************************************************/ +void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER768_AVX2_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber768/avx2/symmetric.h b/crypto_kem/kyber768/avx2/symmetric.h index eecd78c2..6b4816dd 100644 --- a/crypto_kem/kyber768/avx2/symmetric.h +++ b/crypto_kem/kyber768/avx2/symmetric.h @@ -2,28 +2,36 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" #include "fips202x4.h" -typedef shake128ctx keccak_state; +typedef shake128ctx xof_state; -void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(keccak_state 
*s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER768_AVX2_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER768_AVX2_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +void PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(shake128ctx *s, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y); + +void PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); + +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER768_AVX2_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER768_AVX2_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER768_AVX2_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES SHAKE128_RATE - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber768/avx2/verify.c b/crypto_kem/kyber768/avx2/verify.c index 10cdfa47..7fd33147 100644 --- a/crypto_kem/kyber768/avx2/verify.c +++ b/crypto_kem/kyber768/avx2/verify.c @@ -1,23 +1,22 @@ #include "verify.h" - #include #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER768_AVX2_verify * * Description: Compare two arrays for equality in constant time. 
* -* Arguments: const uint8_t *a: pointer to first byte array -* const uint8_t *b: pointer to second byte array +* Arguments: const unsigned char *a: pointer to first byte array +* const unsigned char *b: pointer to second byte array * size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { - size_t pos; - uint64_t r; +int PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t pos = 0; + uint64_t r = 0; __m256i avec, bvec, cvec; cvec = _mm256_setzero_si256(); @@ -27,38 +26,38 @@ uint8_t PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t avec = _mm256_xor_si256(avec, bvec); cvec = _mm256_or_si256(cvec, avec); } + r = !_mm256_testz_si256(cvec, cvec); - cvec = _mm256_cmpeq_epi8(cvec, _mm256_setzero_si256()); - r = (uint32_t)(_mm256_movemask_epi8(cvec) ^ -1); - - while (pos < len) { - r |= a[pos] ^ b[pos]; - pos += 1; + if (pos < len) { + avec = _mm256_loadu_si256((__m256i *)&a[pos]); + bvec = _mm256_loadu_si256((__m256i *)&b[pos]); + cvec = _mm256_cmpeq_epi8(avec, bvec); + r |= _mm256_movemask_epi8(cvec) & (-(uint32_t)1 >> (32 + pos - len)); } r = (-r) >> 63; - return (uint8_t)r; + return r; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER768_AVX2_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; * assumes two's complement representation of negative integers. * Runs in constant time. 
* -* Arguments: uint8_t *r: pointer to output byte array -* const uint8_t *x: pointer to input byte array +* Arguments: unsigned char *r: pointer to output byte array +* const unsigned char *x: pointer to input byte array * size_t len: Amount of bytes to be copied -* uint8_t b: Condition bit; has to be in {0,1} +* unsigned char b: Condition bit; has to be in {0,1} **************************************************/ -void PQCLEAN_KYBER768_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t pos; +void PQCLEAN_KYBER768_AVX2_cmov(uint8_t *restrict r, const uint8_t *restrict x, size_t len, uint8_t b) { + size_t pos = 0; __m256i xvec, rvec, bvec; b = -b; - bvec = _mm256_set1_epi8((char)b); + bvec = _mm256_set1_epi8(b); for (pos = 0; pos + 32 <= len; pos += 32) { rvec = _mm256_loadu_si256((__m256i *)&r[pos]); diff --git a/crypto_kem/kyber768/avx2/verify.h b/crypto_kem/kyber768/avx2/verify.h index fbb7b58f..54c986d1 100644 --- a/crypto_kem/kyber768/avx2/verify.h +++ b/crypto_kem/kyber768/avx2/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER768_AVX2_VERIFY_H +#define PQCLEAN_KYBER768_AVX2_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER768_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + void PQCLEAN_KYBER768_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/crypto_kem/kyber768/clean/LICENSE b/crypto_kem/kyber768/clean/LICENSE index 7b02ea1b..08473af7 100644 --- a/crypto_kem/kyber768/clean/LICENSE +++ b/crypto_kem/kyber768/clean/LICENSE @@ -1,14 +1,4 @@ -kyber-20170627 -Public Domain -Authors: Joppe Bos, - Léo Ducas, - Eike Kiltz , - Tancrède Lepoint, - Vadim Lyubashevsky, - John Schanck, - Peter Schwabe, - Gregor Seiler, - Damien Stehlé +Public Domain (https://creativecommons.org/share-your-work/public-domain/cc0/) For Keccak and AES we are using public-domain code from 
sources and by authors listed in diff --git a/crypto_kem/kyber768/clean/Makefile b/crypto_kem/kyber768/clean/Makefile index 93058b9c..f8c1c5db 100644 --- a/crypto_kem/kyber768/clean/Makefile +++ b/crypto_kem/kyber768/clean/Makefile @@ -1,8 +1,8 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libkyber768_clean.a -HEADERS=api.h cbd.h indcpa.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h -OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-fips202.o +HEADERS=api.h cbd.h indcpa.h kem.h ntt.h params.h poly.h polyvec.h reduce.h verify.h symmetric.h +OBJECTS=cbd.o indcpa.o kem.o ntt.o poly.o polyvec.o reduce.o verify.o symmetric-shake.o CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/kyber768/clean/Makefile.Microsoft_nmake b/crypto_kem/kyber768/clean/Makefile.Microsoft_nmake index bf738c2c..394bc4cf 100644 --- a/crypto_kem/kyber768/clean/Makefile.Microsoft_nmake +++ b/crypto_kem/kyber768/clean/Makefile.Microsoft_nmake @@ -2,7 +2,7 @@ # nmake /f Makefile.Microsoft_nmake LIBRARY=libkyber768_clean.lib -OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-fips202.obj +OBJECTS=cbd.obj indcpa.obj kem.obj ntt.obj poly.obj polyvec.obj reduce.obj verify.obj symmetric-shake.obj # Warning C4146 is raised when a unary minus operator is applied to an # unsigned type; this has nonetheless been standard and portable for as diff --git a/crypto_kem/kyber768/clean/cbd.c b/crypto_kem/kyber768/clean/cbd.c index 197c31c8..90e20195 100644 --- a/crypto_kem/kyber768/clean/cbd.c +++ b/crypto_kem/kyber768/clean/cbd.c @@ -1,7 +1,5 @@ -#include "cbd.h" #include "params.h" - -#include +#include "cbd.h" #include /************************************************* @@ -14,8 +12,8 @@ * * Returns 32-bit unsigned integer loaded from x **************************************************/ -static uint32_t 
load32_littleendian(const uint8_t *x) { - uint32_t r; +static uint32_t load32_littleendian(const uint8_t x[4]) { + uint32_t r = 0; r = (uint32_t)x[0]; r |= (uint32_t)x[1] << 8; r |= (uint32_t)x[2] << 16; @@ -24,27 +22,27 @@ static uint32_t load32_littleendian(const uint8_t *x) { } /************************************************* -* Name: cbd +* Name: PQCLEAN_KYBER768_CLEAN_cbd * * Description: Given an array of uniformly random bytes, compute * polynomial with coefficients distributed according to * a centered binomial distribution with parameter KYBER_ETA -* specialized for KYBER_ETA=2 * -* Arguments: - poly *r: pointer to output polynomial +* Arguments: - poly *r: pointer to output polynomial * - const uint8_t *buf: pointer to input byte array **************************************************/ -void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t *buf) { - uint32_t d, t; - int16_t a, b; +void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]) { + unsigned int i = 0, j = 0; + uint32_t t = 0, d = 0; + int16_t a = 0, b = 0; - for (size_t i = 0; i < KYBER_N / 8; i++) { - t = load32_littleendian(buf + 4 * i); + for (i = 0; i < KYBER_N / 8; i++) { + t = load32_littleendian(buf + 4 * i); d = t & 0x55555555; d += (t >> 1) & 0x55555555; - for (size_t j = 0; j < 8; j++) { - a = (d >> 4 * j) & 0x3; + for (j = 0; j < 8; j++) { + a = (d >> (4 * j + 0)) & 0x3; b = (d >> (4 * j + 2)) & 0x3; r->coeffs[8 * i + j] = a - b; } diff --git a/crypto_kem/kyber768/clean/cbd.h b/crypto_kem/kyber768/clean/cbd.h index 89c24fe2..26818803 100644 --- a/crypto_kem/kyber768/clean/cbd.h +++ b/crypto_kem/kyber768/clean/cbd.h @@ -1,8 +1,11 @@ -#ifndef CBD_H -#define CBD_H +#ifndef PQCLEAN_KYBER768_CLEAN_CBD_H +#define PQCLEAN_KYBER768_CLEAN_CBD_H +#include "params.h" #include "poly.h" +#include -void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t *buf); + +void PQCLEAN_KYBER768_CLEAN_cbd(poly *r, const uint8_t buf[KYBER_ETA * KYBER_N / 4]); #endif diff --git 
a/crypto_kem/kyber768/clean/indcpa.c b/crypto_kem/kyber768/clean/indcpa.c index 47f6d808..bc3aa176 100644 --- a/crypto_kem/kyber768/clean/indcpa.c +++ b/crypto_kem/kyber768/clean/indcpa.c @@ -5,7 +5,7 @@ #include "polyvec.h" #include "randombytes.h" #include "symmetric.h" - +#include #include /************************************************* @@ -16,12 +16,15 @@ * and the public seed used to generate the matrix A. * * Arguments: uint8_t *r: pointer to the output serialized public key -* const poly *pk: pointer to the input public-key polynomial +* polyvec *pk: pointer to the input public-key polyvec * const uint8_t *seed: pointer to the input public seed **************************************************/ -static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { +static void pack_pk(uint8_t r[KYBER_INDCPA_PUBLICKEYBYTES], + polyvec *pk, + const uint8_t seed[KYBER_SYMBYTES]) { + size_t i = 0; PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(r, pk); - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { r[i + KYBER_POLYVECBYTES] = seed[i]; } } @@ -32,13 +35,18 @@ static void pack_pk(uint8_t *r, polyvec *pk, const uint8_t *seed) { * Description: De-serialize public key from a byte array; * approximate inverse of pack_pk * -* Arguments: - polyvec *pk: pointer to output public-key vector of polynomials -* - uint8_t *seed: pointer to output seed to generate matrix A +* Arguments: - polyvec *pk: pointer to output public-key +* polynomial vector +* - uint8_t *seed: pointer to output seed to generate +* matrix A * - const uint8_t *packedpk: pointer to input serialized public key **************************************************/ -static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { +static void unpack_pk(polyvec *pk, + uint8_t seed[KYBER_SYMBYTES], + const uint8_t packedpk[KYBER_INDCPA_PUBLICKEYBYTES]) { + size_t i = 0; PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(pk, packedpk); - for (size_t i = 0; i < KYBER_SYMBYTES; 
i++) { + for (i = 0; i < KYBER_SYMBYTES; i++) { seed[i] = packedpk[i + KYBER_POLYVECBYTES]; } } @@ -49,9 +57,9 @@ static void unpack_pk(polyvec *pk, uint8_t *seed, const uint8_t *packedpk) { * Description: Serialize the secret key * * Arguments: - uint8_t *r: pointer to output serialized secret key -* - const polyvec *sk: pointer to input vector of polynomials (secret key) +* - polyvec *sk: pointer to input vector of polynomials (secret key) **************************************************/ -static void pack_sk(uint8_t *r, polyvec *sk) { +static void pack_sk(uint8_t r[KYBER_INDCPA_SECRETKEYBYTES], polyvec *sk) { PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(r, sk); } @@ -61,10 +69,12 @@ static void pack_sk(uint8_t *r, polyvec *sk) { * Description: De-serialize the secret key; * inverse of pack_sk * -* Arguments: - polyvec *sk: pointer to output vector of polynomials (secret key) +* Arguments: - polyvec *sk: pointer to output vector of +* polynomials (secret key) * - const uint8_t *packedsk: pointer to input serialized secret key **************************************************/ -static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { +static void unpack_sk(polyvec *sk, + const uint8_t packedsk[KYBER_INDCPA_SECRETKEYBYTES]) { PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(sk, packedsk); } @@ -75,11 +85,13 @@ static void unpack_sk(polyvec *sk, const uint8_t *packedsk) { * compressed and serialized vector of polynomials b * and the compressed and serialized polynomial v * -* Arguments: uint8_t *r: pointer to the output serialized ciphertext -* const poly *pk: pointer to the input vector of polynomials b -* const uint8_t *seed: pointer to the input polynomial v +* Arguments: uint8_t *r: pointer to the output serialized ciphertext +* poly *pk: pointer to the input vector of polynomials b +* poly *v: pointer to the input polynomial v **************************************************/ -static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { +static void 
pack_ciphertext(uint8_t r[KYBER_INDCPA_BYTES], + polyvec *b, + poly *v) { PQCLEAN_KYBER768_CLEAN_polyvec_compress(r, b); PQCLEAN_KYBER768_CLEAN_poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v); } @@ -90,11 +102,13 @@ static void pack_ciphertext(uint8_t *r, polyvec *b, poly *v) { * Description: De-serialize and decompress ciphertext from a byte array; * approximate inverse of pack_ciphertext * -* Arguments: - polyvec *b: pointer to the output vector of polynomials b -* - poly *v: pointer to the output polynomial v +* Arguments: - polyvec *b: pointer to the output vector of polynomials b +* - poly *v: pointer to the output polynomial v * - const uint8_t *c: pointer to the input serialized ciphertext **************************************************/ -static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { +static void unpack_ciphertext(polyvec *b, + poly *v, + const uint8_t c[KYBER_INDCPA_BYTES]) { PQCLEAN_KYBER768_CLEAN_polyvec_decompress(b, c); PQCLEAN_KYBER768_CLEAN_poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES); } @@ -105,24 +119,29 @@ static void unpack_ciphertext(polyvec *b, poly *v, const uint8_t *c) { * Description: Run rejection sampling on uniform random bytes to generate * uniform random integers mod q * -* Arguments: - int16_t *r: pointer to output buffer -* - size_t len: requested number of 16-bit integers (uniform mod q) -* - const uint8_t *buf: pointer to input buffer (assumed to be uniform random bytes) -* - size_t buflen: length of input buffer in bytes +* Arguments: - int16_t *r: pointer to output buffer +* - unsigned int len: requested number of 16-bit integers +* (uniform mod q) +* - const uint8_t *buf: pointer to input buffer +* (assumed to be uniform random bytes) +* - unsigned int buflen: length of input buffer in bytes * * Returns number of sampled 16-bit integers (at most len) **************************************************/ -static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buflen) { - 
size_t ctr, pos; - uint16_t val; +static unsigned int rej_uniform(int16_t *r, + unsigned int len, + const uint8_t *buf, + unsigned int buflen) { + unsigned int ctr = 0, pos = 0; + uint16_t val = 0; ctr = pos = 0; while (ctr < len && pos + 2 <= buflen) { - val = (uint16_t)(buf[pos] | ((uint16_t)buf[pos + 1] << 8)); + val = buf[pos] | ((uint16_t)buf[pos + 1] << 8); pos += 2; if (val < 19 * KYBER_Q) { - val -= (uint16_t)((val >> 12) * KYBER_Q); // Barrett reduction + val -= (val >> 12) * KYBER_Q; // Barrett reduction r[ctr++] = (int16_t)val; } } @@ -130,26 +149,28 @@ static size_t rej_uniform(int16_t *r, size_t len, const uint8_t *buf, size_t buf return ctr; } -#define gen_a(A,B) gen_matrix(A,B,0) -#define gen_at(A,B) gen_matrix(A,B,1) +#define gen_a(A,B) PQCLEAN_KYBER768_CLEAN_gen_matrix(A,B,0) +#define gen_at(A,B) PQCLEAN_KYBER768_CLEAN_gen_matrix(A,B,1) /************************************************* -* Name: gen_matrix +* Name: PQCLEAN_KYBER768_CLEAN_gen_matrix * * Description: Deterministically generate matrix A (or the transpose of A) * from a seed. Entries of the matrix are polynomials that look * uniformly random. 
Performs rejection sampling on output of * a XOF * -* Arguments: - polyvec *a: pointer to ouptput matrix A +* Arguments: - polyvec *a: pointer to ouptput matrix A * - const uint8_t *seed: pointer to input seed -* - int transposed: boolean deciding whether A or A^T is generated +* - int transposed: boolean deciding whether A or A^T +* is generated **************************************************/ -#define MAXNBLOCKS ((530+XOF_BLOCKBYTES)/XOF_BLOCKBYTES) /* 530 is expected number of required bytes */ -static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { - size_t ctr; - uint8_t i, j; - uint8_t buf[XOF_BLOCKBYTES * MAXNBLOCKS + 1]; +#define GEN_MATRIX_NBLOCKS ((2*KYBER_N*(1U << 16)/(19*KYBER_Q) \ + + XOF_BLOCKBYTES)/XOF_BLOCKBYTES) +// Not static for benchmarking +void PQCLEAN_KYBER768_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed) { + unsigned int ctr = 0, i = 0, j = 0; + uint8_t buf[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES]; xof_state state; for (i = 0; i < KYBER_K; i++) { @@ -160,12 +181,13 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { xof_absorb(&state, seed, j, i); } - xof_squeezeblocks(buf, MAXNBLOCKS, &state); - ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, MAXNBLOCKS * XOF_BLOCKBYTES); + xof_squeezeblocks(buf, GEN_MATRIX_NBLOCKS, &state); + ctr = rej_uniform(a[i].vec[j].coeffs, KYBER_N, buf, sizeof(buf)); while (ctr < KYBER_N) { xof_squeezeblocks(buf, 1, &state); - ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, XOF_BLOCKBYTES); + ctr += rej_uniform(a[i].vec[j].coeffs + ctr, KYBER_N - ctr, buf, + XOF_BLOCKBYTES); } xof_ctx_release(&state); } @@ -173,40 +195,44 @@ static void gen_matrix(polyvec *a, const uint8_t *seed, int transposed) { } /************************************************* -* Name: indcpa_keypair +* Name: PQCLEAN_KYBER768_CLEAN_indcpa_keypair * * Description: Generates public and private key for the CPA-secure * public-key encryption scheme 
underlying Kyber * -* Arguments: - uint8_t *pk: pointer to output public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (of length KYBER_INDCPA_SECRETKEYBYTES bytes) +* Arguments: - uint8_t *pk: pointer to output public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) +* - uint8_t *sk: pointer to output private key + (of length KYBER_INDCPA_SECRETKEYBYTES bytes) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { - polyvec a[KYBER_K], e, pkpv, skpv; +void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { + unsigned int i = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t *publicseed = buf; - uint8_t *noiseseed = buf + KYBER_SYMBYTES; + const uint8_t *publicseed = buf; + const uint8_t *noiseseed = buf + KYBER_SYMBYTES; uint8_t nonce = 0; + polyvec a[KYBER_K], e, pkpv, skpv; randombytes(buf, KYBER_SYMBYTES); hash_g(buf, buf, KYBER_SYMBYTES); gen_a(a, publicseed); - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(skpv.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_poly_getnoise(&skpv.vec[i], noiseseed, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_getnoise(e.vec + i, noiseseed, nonce++); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_poly_getnoise(&e.vec[i], noiseseed, nonce++); } PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&skpv); PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&e); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(&pkpv.vec[i], &a[i], &skpv); - PQCLEAN_KYBER768_CLEAN_poly_frommont(&pkpv.vec[i]); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&pkpv.vec[i], &a[i], &skpv); + PQCLEAN_KYBER768_CLEAN_poly_tomont(&pkpv.vec[i]); } 
PQCLEAN_KYBER768_CLEAN_polyvec_add(&pkpv, &pkpv, &e); @@ -217,34 +243,40 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t *pk, uint8_t *sk) { } /************************************************* -* Name: indcpa_enc +* Name: PQCLEAN_KYBER768_CLEAN_indcpa_enc * * Description: Encryption function of the CPA-secure * public-key encryption scheme underlying Kyber. * -* Arguments: - uint8_t *c: pointer to output ciphertext (of length KYBER_INDCPA_BYTES bytes) -* - const uint8_t *m: pointer to input message (of length KYBER_INDCPA_MSGBYTES bytes) -* - const uint8_t *pk: pointer to input public key (of length KYBER_INDCPA_PUBLICKEYBYTES bytes) -* - const uint8_t *coin: pointer to input random coins used as seed (of length KYBER_SYMBYTES bytes) -* to deterministically generate all randomness +* Arguments: - uint8_t *c: pointer to output ciphertext +* (of length KYBER_INDCPA_BYTES bytes) +* - const uint8_t *m: pointer to input message +* (of length KYBER_INDCPA_MSGBYTES bytes) +* - const uint8_t *pk: pointer to input public key +* (of length KYBER_INDCPA_PUBLICKEYBYTES) +* - const uint8_t *coins: pointer to input random coins +* used as seed (of length KYBER_SYMBYTES) +* to deterministically generate all +* randomness **************************************************/ -void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t *c, - const uint8_t *m, - const uint8_t *pk, - const uint8_t *coins) { - polyvec sp, pkpv, ep, at[KYBER_K], bp; - poly v, k, epp; +void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], + const uint8_t coins[KYBER_SYMBYTES]) { + unsigned int i = 0; uint8_t seed[KYBER_SYMBYTES]; uint8_t nonce = 0; + polyvec sp, pkpv, ep, at[KYBER_K], bp; + poly v, k, epp; unpack_pk(&pkpv, seed, pk); PQCLEAN_KYBER768_CLEAN_poly_frommsg(&k, m); gen_at(at, seed); - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { 
PQCLEAN_KYBER768_CLEAN_poly_getnoise(sp.vec + i, coins, nonce++); } - for (size_t i = 0; i < KYBER_K; i++) { + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_getnoise(ep.vec + i, coins, nonce++); } PQCLEAN_KYBER768_CLEAN_poly_getnoise(&epp, coins, nonce++); @@ -252,14 +284,14 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t *c, PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&sp); // matrix-vector multiplication - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(&bp.vec[i], &at[i], &sp); + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&bp.vec[i], &at[i], &sp); } - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(&v, &pkpv, &sp); + PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&v, &pkpv, &sp); - PQCLEAN_KYBER768_CLEAN_polyvec_invntt(&bp); - PQCLEAN_KYBER768_CLEAN_poly_invntt(&v); + PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(&bp); + PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(&v); PQCLEAN_KYBER768_CLEAN_polyvec_add(&bp, &bp, &ep); PQCLEAN_KYBER768_CLEAN_poly_add(&v, &v, &epp); @@ -271,18 +303,21 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t *c, } /************************************************* -* Name: indcpa_dec +* Name: PQCLEAN_KYBER768_CLEAN_indcpa_dec * * Description: Decryption function of the CPA-secure * public-key encryption scheme underlying Kyber. 
* -* Arguments: - uint8_t *m: pointer to output decrypted message (of length KYBER_INDCPA_MSGBYTES) -* - const uint8_t *c: pointer to input ciphertext (of length KYBER_INDCPA_BYTES) -* - const uint8_t *sk: pointer to input secret key (of length KYBER_INDCPA_SECRETKEYBYTES) +* Arguments: - uint8_t *m: pointer to output decrypted message +* (of length KYBER_INDCPA_MSGBYTES) +* - const uint8_t *c: pointer to input ciphertext +* (of length KYBER_INDCPA_BYTES) +* - const uint8_t *sk: pointer to input secret key +* (of length KYBER_INDCPA_SECRETKEYBYTES) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_indcpa_dec(uint8_t *m, - const uint8_t *c, - const uint8_t *sk) { +void PQCLEAN_KYBER768_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], + const uint8_t c[KYBER_INDCPA_BYTES], + const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]) { polyvec bp, skpv; poly v, mp; @@ -290,8 +325,8 @@ void PQCLEAN_KYBER768_CLEAN_indcpa_dec(uint8_t *m, unpack_sk(&skpv, sk); PQCLEAN_KYBER768_CLEAN_polyvec_ntt(&bp); - PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(&mp, &skpv, &bp); - PQCLEAN_KYBER768_CLEAN_poly_invntt(&mp); + PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(&mp, &skpv, &bp); + PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(&mp); PQCLEAN_KYBER768_CLEAN_poly_sub(&mp, &v, &mp); PQCLEAN_KYBER768_CLEAN_poly_reduce(&mp); diff --git a/crypto_kem/kyber768/clean/indcpa.h b/crypto_kem/kyber768/clean/indcpa.h index b9fd0ea0..e07ba93f 100644 --- a/crypto_kem/kyber768/clean/indcpa.h +++ b/crypto_kem/kyber768/clean/indcpa.h @@ -1,21 +1,16 @@ -#ifndef INDCPA_H -#define INDCPA_H +#ifndef PQCLEAN_KYBER768_CLEAN_INDCPA_H +#define PQCLEAN_KYBER768_CLEAN_INDCPA_H +#include "params.h" +#include "polyvec.h" #include -void PQCLEAN_KYBER768_CLEAN_indcpa_keypair( - uint8_t *pk, - uint8_t *sk); +void PQCLEAN_KYBER768_CLEAN_gen_matrix(polyvec *a, const uint8_t seed[KYBER_SYMBYTES], int transposed); -void PQCLEAN_KYBER768_CLEAN_indcpa_enc( - uint8_t *c, - const uint8_t *m, - 
const uint8_t *pk, - const uint8_t *coins); +void PQCLEAN_KYBER768_CLEAN_indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_KYBER768_CLEAN_indcpa_dec( - uint8_t *m, - const uint8_t *c, - const uint8_t *sk); +void PQCLEAN_KYBER768_CLEAN_indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES], const uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES], const uint8_t coins[KYBER_SYMBYTES]); + +void PQCLEAN_KYBER768_CLEAN_indcpa_dec(uint8_t m[KYBER_INDCPA_MSGBYTES], const uint8_t c[KYBER_INDCPA_BYTES], const uint8_t sk[KYBER_INDCPA_SECRETKEYBYTES]); #endif diff --git a/crypto_kem/kyber768/clean/kem.c b/crypto_kem/kyber768/clean/kem.c index 8df427b6..f84bcfc5 100644 --- a/crypto_kem/kyber768/clean/kem.c +++ b/crypto_kem/kyber768/clean/kem.c @@ -1,99 +1,125 @@ -#include "api.h" #include "indcpa.h" +#include "kem.h" #include "params.h" #include "randombytes.h" #include "symmetric.h" #include "verify.h" +#include +#include -#include /************************************************* -* Name: crypto_kem_keypair +* Name: PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair * * Description: Generates public and private key * for CCA-secure Kyber key encapsulation mechanism * -* Arguments: - uint8_t *pk: pointer to output public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) -* - uint8_t *sk: pointer to output private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *pk: pointer to output public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* - unsigned char *sk: pointer to output private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - size_t i; +int PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + size_t i = 0; 
PQCLEAN_KYBER768_CLEAN_indcpa_keypair(pk, sk); for (i = 0; i < KYBER_INDCPA_PUBLICKEYBYTES; i++) { sk[i + KYBER_INDCPA_SECRETKEYBYTES] = pk[i]; } hash_h(sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); - randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); /* Value z for pseudo-random output on reject */ + /* Value z for pseudo-random output on reject */ + randombytes(sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_enc +* Name: PQCLEAN_KYBER768_CLEAN_crypto_kem_enc * * Description: Generates cipher text and shared * secret for given public key * -* Arguments: - uint8_t *ct: pointer to output cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *pk: pointer to input public key (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) +* Arguments: - unsigned char *ct: pointer to output cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *pk: pointer to input public key +* (an already allocated array of CRYPTO_PUBLICKEYBYTES bytes) * * Returns 0 (success) **************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(uint8_t *ct, uint8_t *ss, const uint8_t *pk) { - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ +int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk) { uint8_t buf[2 * KYBER_SYMBYTES]; + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; randombytes(buf, KYBER_SYMBYTES); - hash_h(buf, buf, KYBER_SYMBYTES); /* Don't release system RNG output */ + /* Don't release system RNG output */ + hash_h(buf, buf, 
KYBER_SYMBYTES); - hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); /* Multitarget countermeasure for coins + contributory KEM */ + /* Multitarget countermeasure for coins + contributory KEM */ + hash_h(buf + KYBER_SYMBYTES, pk, KYBER_PUBLICKEYBYTES); hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER768_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER768_CLEAN_indcpa_enc(ct, buf, pk, kr + KYBER_SYMBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); + /* hash concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } /************************************************* -* Name: crypto_kem_dec +* Name: PQCLEAN_KYBER768_CLEAN_crypto_kem_dec * * Description: Generates shared secret for given * cipher text and private key * -* Arguments: - uint8_t *ss: pointer to output shared secret (an already allocated array of CRYPTO_BYTES bytes) -* - const uint8_t *ct: pointer to input cipher text (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) -* - const uint8_t *sk: pointer to input private key (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) +* Arguments: - unsigned char *ss: pointer to output shared secret +* (an already allocated array of CRYPTO_BYTES bytes) +* - const unsigned char *ct: pointer to input cipher text +* (an already allocated array of CRYPTO_CIPHERTEXTBYTES bytes) +* - const unsigned char *sk: pointer to input private key +* (an already allocated array of CRYPTO_SECRETKEYBYTES bytes) * * Returns 0. * * On failure, ss will contain a pseudo-random value. 
**************************************************/ -int PQCLEAN_KYBER768_CLEAN_crypto_kem_dec(uint8_t *ss, const uint8_t *ct, const uint8_t *sk) { - size_t i; - uint8_t fail; - uint8_t cmp[KYBER_CIPHERTEXTBYTES]; +int PQCLEAN_KYBER768_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk) { + size_t i = 0; + int fail = 0; uint8_t buf[2 * KYBER_SYMBYTES]; - uint8_t kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */ + /* Will contain key, coins */ + uint8_t kr[2 * KYBER_SYMBYTES]; + uint8_t cmp[KYBER_CIPHERTEXTBYTES]; const uint8_t *pk = sk + KYBER_INDCPA_SECRETKEYBYTES; PQCLEAN_KYBER768_CLEAN_indcpa_dec(buf, ct, sk); - for (i = 0; i < KYBER_SYMBYTES; i++) { /* Multitarget countermeasure for coins + contributory KEM */ - buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; /* Save hash by storing H(pk) in sk */ + /* Multitarget countermeasure for coins + contributory KEM */ + for (i = 0; i < KYBER_SYMBYTES; i++) { + buf[KYBER_SYMBYTES + i] = sk[KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES + i]; } hash_g(kr, buf, 2 * KYBER_SYMBYTES); - PQCLEAN_KYBER768_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); /* coins are in kr+KYBER_SYMBYTES */ + /* coins are in kr+KYBER_SYMBYTES */ + PQCLEAN_KYBER768_CLEAN_indcpa_enc(cmp, buf, pk, kr + KYBER_SYMBYTES); fail = PQCLEAN_KYBER768_CLEAN_verify(ct, cmp, KYBER_CIPHERTEXTBYTES); - hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); /* overwrite coins in kr with H(c) */ + /* overwrite coins in kr with H(c) */ + hash_h(kr + KYBER_SYMBYTES, ct, KYBER_CIPHERTEXTBYTES); - PQCLEAN_KYBER768_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); /* Overwrite pre-k with z on re-encryption failure */ + /* Overwrite pre-k with z on re-encryption failure */ + PQCLEAN_KYBER768_CLEAN_cmov(kr, sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES, KYBER_SYMBYTES, fail); - kdf(ss, kr, 2 * KYBER_SYMBYTES); /* hash concatenation of pre-k and H(c) to k */ + /* hash 
concatenation of pre-k and H(c) to k */ + kdf(ss, kr, 2 * KYBER_SYMBYTES); return 0; } diff --git a/crypto_kem/kyber768/clean/kem.h b/crypto_kem/kyber768/clean/kem.h new file mode 100644 index 00000000..75c79892 --- /dev/null +++ b/crypto_kem/kyber768/clean/kem.h @@ -0,0 +1,19 @@ +#ifndef PQCLEAN_KYBER768_CLEAN_KEM_H +#define PQCLEAN_KYBER768_CLEAN_KEM_H + +#include "params.h" + + +int PQCLEAN_KYBER768_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + + +int PQCLEAN_KYBER768_CLEAN_crypto_kem_enc(unsigned char *ct, + unsigned char *ss, + const unsigned char *pk); + + +int PQCLEAN_KYBER768_CLEAN_crypto_kem_dec(unsigned char *ss, + const unsigned char *ct, + const unsigned char *sk); + +#endif diff --git a/crypto_kem/kyber768/clean/ntt.c b/crypto_kem/kyber768/clean/ntt.c index e80ea6a2..f9b44a29 100644 --- a/crypto_kem/kyber768/clean/ntt.c +++ b/crypto_kem/kyber768/clean/ntt.c @@ -1,11 +1,9 @@ -#include "ntt.h" #include "params.h" +#include "ntt.h" #include "reduce.h" - -#include #include -/* Code to generate zetas and zetas_inv used in the number-theoretic transform: +/* Code to generate PQCLEAN_KYBER768_CLEAN_zetas and PQCLEAN_KYBER768_CLEAN_zetas_inv used in the number-theoretic transform: #define KYBER_ROOT_OF_UNITY 17 @@ -17,12 +15,8 @@ static const uint16_t tree[128] = { 1, 65, 33, 97, 17, 81, 49, 113, 9, 73, 41, 105, 25, 89, 57, 121, 5, 69, 37, 101, 21, 85, 53, 117, 13, 77, 45, 109, 29, 93, 61, 125, 3, 67, 35, 99, 19, 83, 51, 115, 11, 75, 43, 107, 27, 91, 59, 123, - 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127}; - - -static int16_t fqmul(int16_t a, int16_t b) { - return montgomery_reduce((int32_t)a*b); -} + 7, 71, 39, 103, 23, 87, 55, 119, 15, 79, 47, 111, 31, 95, 63, 127 +}; void init_ntt() { unsigned int i, j, k; @@ -33,40 +27,44 @@ void init_ntt() { tmp[i] = fqmul(tmp[i-1], KYBER_ROOT_OF_UNITY*MONT % KYBER_Q); for(i = 0; i < 128; ++i) - zetas[i] = tmp[tree[i]]; + PQCLEAN_KYBER768_CLEAN_zetas[i] = tmp[tree[i]]; k = 0; 
for(i = 64; i >= 1; i >>= 1) for(j = i; j < 2*i; ++j) - zetas_inv[k++] = -tmp[128 - tree[j]]; + PQCLEAN_KYBER768_CLEAN_zetas_inv[k++] = -tmp[128 - tree[j]]; - zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; + PQCLEAN_KYBER768_CLEAN_zetas_inv[127] = MONT * (MONT * (KYBER_Q - 1) * ((KYBER_Q - 1)/128) % KYBER_Q) % KYBER_Q; } */ + const int16_t PQCLEAN_KYBER768_CLEAN_zetas[128] = { - 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, 2127, 1855, 1468, - 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, 732, 608, 1787, 411, 3124, 1758, - 1223, 652, 2777, 1015, 2036, 1491, 3047, 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, - 2476, 3239, 3058, 830, 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, - 2226, 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, 1653, - 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, 418, 329, 3173, 3254, - 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, 1218, 1994, 2455, 220, 2142, 1670, - 2144, 1799, 2051, 794, 1819, 2475, 2459, 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 + 2285, 2571, 2970, 1812, 1493, 1422, 287, 202, 3158, 622, 1577, 182, 962, + 2127, 1855, 1468, 573, 2004, 264, 383, 2500, 1458, 1727, 3199, 2648, 1017, + 732, 608, 1787, 411, 3124, 1758, 1223, 652, 2777, 1015, 2036, 1491, 3047, + 1785, 516, 3321, 3009, 2663, 1711, 2167, 126, 1469, 2476, 3239, 3058, 830, + 107, 1908, 3082, 2378, 2931, 961, 1821, 2604, 448, 2264, 677, 2054, 2226, + 430, 555, 843, 2078, 871, 1550, 105, 422, 587, 177, 3094, 3038, 2869, 1574, + 1653, 3083, 778, 1159, 3182, 2552, 1483, 2727, 1119, 1739, 644, 2457, 349, + 418, 329, 3173, 3254, 817, 1097, 603, 610, 1322, 2044, 1864, 384, 2114, 3193, + 1218, 1994, 2455, 220, 2142, 1670, 2144, 1799, 2051, 794, 1819, 2475, 2459, + 478, 3221, 3021, 996, 991, 958, 1869, 1522, 1628 }; const int16_t PQCLEAN_KYBER768_CLEAN_zetas_inv[128] = { - 1701, 1807, 1460, 2371, 2338, 
2333, 308, 108, 2851, 870, 854, 1510, 2535, 1278, 1530, 1185, - 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, 1285, 2007, 2719, 2726, 2232, 2512, - 75, 156, 3000, 2911, 2980, 872, 2685, 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, - 1676, 1755, 460, 291, 235, 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, - 1275, 2652, 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, - 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, 2677, 2106, - 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, 829, 2946, 3065, 1325, 2756, - 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 + 1701, 1807, 1460, 2371, 2338, 2333, 308, 108, 2851, 870, 854, 1510, 2535, + 1278, 1530, 1185, 1659, 1187, 3109, 874, 1335, 2111, 136, 1215, 2945, 1465, + 1285, 2007, 2719, 2726, 2232, 2512, 75, 156, 3000, 2911, 2980, 872, 2685, + 1590, 2210, 602, 1846, 777, 147, 2170, 2551, 246, 1676, 1755, 460, 291, 235, + 3152, 2742, 2907, 3224, 1779, 2458, 1251, 2486, 2774, 2899, 1103, 1275, 2652, + 1065, 2881, 725, 1508, 2368, 398, 951, 247, 1421, 3222, 2499, 271, 90, 853, + 1860, 3203, 1162, 1618, 666, 320, 8, 2813, 1544, 282, 1838, 1293, 2314, 552, + 2677, 2106, 1571, 205, 2918, 1542, 2721, 2597, 2312, 681, 130, 1602, 1871, + 829, 2946, 3065, 1325, 2756, 1861, 1474, 1202, 2367, 3147, 1752, 2707, 171, + 3127, 3042, 1907, 1836, 1517, 359, 758, 1441 }; - /************************************************* * Name: fqmul * @@ -82,68 +80,73 @@ static int16_t fqmul(int16_t a, int16_t b) { } /************************************************* -* Name: ntt +* Name: PQCLEAN_KYBER768_CLEAN_ntt * * Description: Inplace number-theoretic transform (NTT) in Rq * input is in standard order, output is in bitreversed order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq 
**************************************************/ -void PQCLEAN_KYBER768_CLEAN_ntt(int16_t poly[256]) { - size_t j, k = 1; - int16_t t, zeta; +void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]) { + unsigned int len = 0, start = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 128; len >= 2; len >>= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 1; + for (len = 128; len >= 2; len >>= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER768_CLEAN_zetas[k++]; for (j = start; j < start + len; ++j) { - t = fqmul(zeta, poly[j + len]); - poly[j + len] = poly[j] - t; - poly[j] = poly[j] + t; + t = fqmul(zeta, r[j + len]); + r[j + len] = r[j] - t; + r[j] = r[j] + t; } } } } /************************************************* -* Name: invntt +* Name: invntt_tomont * -* Description: Inplace inverse number-theoretic transform in Rq -* input is in bitreversed order, output is in standard order +* Description: Inplace inverse number-theoretic transform in Rq and +* multiplication by Montgomery factor 2^16. 
+* Input is in bitreversed order, output is in standard order * -* Arguments: - int16_t poly[256]: pointer to input/output vector of elements of Zq +* Arguments: - int16_t r[256]: pointer to input/output vector of elements +* of Zq **************************************************/ -void PQCLEAN_KYBER768_CLEAN_invntt(int16_t poly[256]) { - size_t j, k = 0; - int16_t t, zeta; +void PQCLEAN_KYBER768_CLEAN_invntt(int16_t r[256]) { + unsigned int start = 0, len = 0, j = 0, k = 0; + int16_t t = 0, zeta = 0; - for (size_t len = 2; len <= 128; len <<= 1) { - for (size_t start = 0; start < 256; start = j + len) { + k = 0; + for (len = 2; len <= 128; len <<= 1) { + for (start = 0; start < 256; start = j + len) { zeta = PQCLEAN_KYBER768_CLEAN_zetas_inv[k++]; for (j = start; j < start + len; ++j) { - t = poly[j]; - poly[j] = PQCLEAN_KYBER768_CLEAN_barrett_reduce(t + poly[j + len]); - poly[j + len] = t - poly[j + len]; - poly[j + len] = fqmul(zeta, poly[j + len]); + t = r[j]; + r[j] = PQCLEAN_KYBER768_CLEAN_barrett_reduce(t + r[j + len]); + r[j + len] = t - r[j + len]; + r[j + len] = fqmul(zeta, r[j + len]); } } } for (j = 0; j < 256; ++j) { - poly[j] = fqmul(poly[j], PQCLEAN_KYBER768_CLEAN_zetas_inv[127]); + r[j] = fqmul(r[j], PQCLEAN_KYBER768_CLEAN_zetas_inv[127]); } } /************************************************* -* Name: basemul +* Name: PQCLEAN_KYBER768_CLEAN_basemul * -* Description: Multiplication of polynomials in Zq[X]/((X^2-zeta)) +* Description: Multiplication of polynomials in Zq[X]/(X^2-zeta) * used for multiplication of elements in Rq in NTT domain * -* Arguments: - int16_t r[2]: pointer to the output polynomial +* Arguments: - int16_t r[2]: pointer to the output polynomial * - const int16_t a[2]: pointer to the first factor * - const int16_t b[2]: pointer to the second factor -* - int16_t zeta: integer defining the reduction polynomial +* - int16_t zeta: integer defining the reduction polynomial **************************************************/ void 
PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta) { r[0] = fqmul(a[1], b[1]); diff --git a/crypto_kem/kyber768/clean/ntt.h b/crypto_kem/kyber768/clean/ntt.h index 5b84d53c..d3ca297c 100644 --- a/crypto_kem/kyber768/clean/ntt.h +++ b/crypto_kem/kyber768/clean/ntt.h @@ -1,13 +1,22 @@ -#ifndef NTT_H -#define NTT_H +#ifndef PQCLEAN_KYBER768_CLEAN_NTT_H +#define PQCLEAN_KYBER768_CLEAN_NTT_H +#include "params.h" #include + extern const int16_t PQCLEAN_KYBER768_CLEAN_zetas[128]; -extern const int16_t PQCLEAN_KYBER768_CLEAN_zetasinv[128]; -void PQCLEAN_KYBER768_CLEAN_ntt(int16_t *poly); -void PQCLEAN_KYBER768_CLEAN_invntt(int16_t *poly); + +extern const int16_t PQCLEAN_KYBER768_CLEAN_zetas_inv[128]; + + +void PQCLEAN_KYBER768_CLEAN_ntt(int16_t r[256]); + + +void PQCLEAN_KYBER768_CLEAN_invntt(int16_t r[256]); + + void PQCLEAN_KYBER768_CLEAN_basemul(int16_t r[2], const int16_t a[2], const int16_t b[2], int16_t zeta); #endif diff --git a/crypto_kem/kyber768/clean/params.h b/crypto_kem/kyber768/clean/params.h index 3a1e0d10..81873d6e 100644 --- a/crypto_kem/kyber768/clean/params.h +++ b/crypto_kem/kyber768/clean/params.h @@ -1,8 +1,5 @@ -#ifndef PARAMS_H -#define PARAMS_H - - -/* Don't change parameters below this line */ +#ifndef PQCLEAN_KYBER768_CLEAN_PARAMS_H +#define PQCLEAN_KYBER768_CLEAN_PARAMS_H #define KYBER_N 256 #define KYBER_Q 3329 @@ -12,9 +9,8 @@ #define KYBER_SYMBYTES 32 /* size in bytes of hashes, and seeds */ #define KYBER_SSBYTES 32 /* size in bytes of shared key */ -#define KYBER_POLYBYTES 384 -#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) - +#define KYBER_POLYBYTES 384 +#define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES) #define KYBER_K 3 #define KYBER_POLYCOMPRESSEDBYTES 128 @@ -23,10 +19,14 @@ #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES #define KYBER_INDCPA_PUBLICKEYBYTES (KYBER_POLYVECBYTES + KYBER_SYMBYTES) #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES) -#define KYBER_INDCPA_BYTES 
(KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES) +#define KYBER_INDCPA_BYTES (KYBER_POLYVECCOMPRESSEDBYTES \ + + KYBER_POLYCOMPRESSEDBYTES) #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES) -#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES + 2*KYBER_SYMBYTES) /* 32 bytes of additional space to save H(pk) */ +/* 32 bytes of additional space to save H(pk) */ +#define KYBER_SECRETKEYBYTES (KYBER_INDCPA_SECRETKEYBYTES \ + + KYBER_INDCPA_PUBLICKEYBYTES \ + + 2*KYBER_SYMBYTES) #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES #endif diff --git a/crypto_kem/kyber768/clean/poly.c b/crypto_kem/kyber768/clean/poly.c index 6c646895..4ffd0a1e 100644 --- a/crypto_kem/kyber768/clean/poly.c +++ b/crypto_kem/kyber768/clean/poly.c @@ -1,119 +1,164 @@ +#include "params.h" #include "cbd.h" #include "ntt.h" -#include "params.h" #include "poly.h" #include "reduce.h" #include "symmetric.h" - #include + /************************************************* -* Name: poly_compress +* Name: PQCLEAN_KYBER768_CLEAN_poly_compress * * Description: Compression and subsequent serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYCOMPRESSEDBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (of length KYBER_POLYCOMPRESSEDBYTES) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t *r, poly *a) { +void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a) { + unsigned int i = 0, j = 0; uint8_t t[8]; - size_t k = 0; PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N; i += 8) { - for (size_t j = 0; j < 8; j++) { - t[j] = ((((uint32_t)a->coeffs[i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + t[j] = 
((((uint16_t)a->coeffs[8 * i + j] << 4) + KYBER_Q / 2) / KYBER_Q) & 15; } - r[k] = (uint8_t)(t[0] | (t[1] << 4)); - r[k + 1] = (uint8_t)(t[2] | (t[3] << 4)); - r[k + 2] = (uint8_t)(t[4] | (t[5] << 4)); - r[k + 3] = (uint8_t)(t[6] | (t[7] << 4)); - k += 4; + r[0] = t[0] | (t[1] << 4); + r[1] = t[2] | (t[3] << 4); + r[2] = t[4] | (t[5] << 4); + r[3] = t[6] | (t[7] << 4); + r += 4; } } /************************************************* -* Name: poly_decompress +* Name: PQCLEAN_KYBER768_CLEAN_poly_decompress * * Description: De-serialization and subsequent decompression of a polynomial; -* approximate inverse of poly_compress +* approximate inverse of PQCLEAN_KYBER768_CLEAN_poly_compress * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of length KYBER_POLYCOMPRESSEDBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYCOMPRESSEDBYTES bytes) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N; i += 8) { - r->coeffs[i + 0] = (int16_t)((((a[0] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 1] = (int16_t)((((a[0] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 2] = (int16_t)((((a[1] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 3] = (int16_t)((((a[1] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 4] = (int16_t)((((a[2] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 5] = (int16_t)((((a[2] >> 4) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 6] = (int16_t)((((a[3] & 15) * KYBER_Q) + 8) >> 4); - r->coeffs[i + 7] = (int16_t)((((a[3] >> 4) * KYBER_Q) + 8) >> 4); - a += 4; +void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]) { + unsigned int i = 0; + + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i + 0] = (((uint16_t)(a[0] & 15) * KYBER_Q) + 8) >> 4; + r->coeffs[2 * i + 1] = (((uint16_t)(a[0] >> 
4) * KYBER_Q) + 8) >> 4; + a += 1; } } /************************************************* -* Name: poly_tobytes +* Name: PQCLEAN_KYBER768_CLEAN_poly_tobytes * * Description: Serialization of a polynomial * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYBYTES bytes) -* - const poly *a: pointer to input polynomial +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYBYTES bytes) +* - poly *a: pointer to input polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t *r, poly *a) { - int16_t t0, t1; +void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a) { + unsigned int i = 0; + uint16_t t0 = 0, t1 = 0; PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - for (size_t i = 0; i < KYBER_N / 2; i++) { + for (i = 0; i < KYBER_N / 2; i++) { t0 = a->coeffs[2 * i]; t1 = a->coeffs[2 * i + 1]; - r[3 * i] = t0 & 0xff; - r[3 * i + 1] = (uint8_t)((t0 >> 8) | ((t1 & 0xf) << 4)); - r[3 * i + 2] = (uint8_t)(t1 >> 4); + r[3 * i + 0] = (t0 >> 0); + r[3 * i + 1] = (t0 >> 8) | (t1 << 4); + r[3 * i + 2] = (t1 >> 4); } } /************************************************* -* Name: poly_frombytes +* Name: PQCLEAN_KYBER768_CLEAN_poly_frombytes * * Description: De-serialization of a polynomial; -* inverse of poly_tobytes +* inverse of PQCLEAN_KYBER768_CLEAN_poly_tobytes +* +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *a: pointer to input byte array +* (of KYBER_POLYBYTES bytes) +**************************************************/ +void PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 2; i++) { + r->coeffs[2 * i] = ((a[3 * i + 0] >> 0) | ((uint16_t)a[3 * i + 1] << 8)) & 0xFFF; + r->coeffs[2 * i + 1] = ((a[3 * i + 1] >> 4) | ((uint16_t)a[3 * i + 2] << 4)) & 0xFFF; + } +} + +/************************************************* +* Name: 
PQCLEAN_KYBER768_CLEAN_poly_frommsg +* +* Description: Convert 32-byte message to polynomial * -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *a: pointer to input byte array (of KYBER_POLYBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *msg: pointer to input message **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_N / 2; i++) { - r->coeffs[2 * i] = (int16_t)(a[3 * i] | ((uint16_t)a[3 * i + 1] & 0x0f) << 8); - r->coeffs[2 * i + 1] = (int16_t)(a[3 * i + 1] >> 4 | ((uint16_t)a[3 * i + 2] & 0xff) << 4); +void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]) { + unsigned int i = 0, j = 0; + int16_t mask = 0; + + for (i = 0; i < KYBER_N / 8; i++) { + for (j = 0; j < 8; j++) { + mask = -(int16_t)((msg[i] >> j) & 1); + r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); + } + } +} + +/************************************************* +* Name: PQCLEAN_KYBER768_CLEAN_poly_tomsg +* +* Description: Convert polynomial to 32-byte message +* +* Arguments: - uint8_t *msg: pointer to output message +* - poly *a: pointer to input polynomial +**************************************************/ +void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a) { + unsigned int i = 0, j = 0; + uint16_t t = 0; + + PQCLEAN_KYBER768_CLEAN_poly_csubq(a); + + for (i = 0; i < KYBER_N / 8; i++) { + msg[i] = 0; + for (j = 0; j < 8; j++) { + t = ((((uint16_t)a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; + msg[i] |= t << j; + } } } /************************************************* -* Name: poly_getnoise +* Name: PQCLEAN_KYBER768_CLEAN_poly_getnoise * * Description: Sample a polynomial deterministically from a seed and a nonce, * with output polynomial close to centered binomial distribution * with parameter KYBER_ETA * -* Arguments: - poly *r: pointer to output 
polynomial -* - const uint8_t *seed: pointer to input seed (pointing to array of length KYBER_SYMBYTES bytes) +* Arguments: - poly *r: pointer to output polynomial +* - const uint8_t *seed: pointer to input seed +* (of length KYBER_SYMBYTES bytes) * - uint8_t nonce: one-byte input nonce **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce) { +void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce) { uint8_t buf[KYBER_ETA * KYBER_N / 4]; - - prf(buf, KYBER_ETA * KYBER_N / 4, seed, nonce); + prf(buf, sizeof(buf), seed, nonce); PQCLEAN_KYBER768_CLEAN_cbd(r, buf); } /************************************************* -* Name: poly_ntt +* Name: PQCLEAN_KYBER768_CLEAN_poly_ntt * * Description: Computes negacyclic number-theoretic transform (NTT) of * a polynomial in place; @@ -127,20 +172,20 @@ void PQCLEAN_KYBER768_CLEAN_poly_ntt(poly *r) { } /************************************************* -* Name: poly_invntt +* Name: PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont * -* Description: Computes inverse of negacyclic number-theoretic transform (NTT) of -* a polynomial in place; +* Description: Computes inverse of negacyclic number-theoretic transform (NTT) +* of a polynomial in place; * inputs assumed to be in bitreversed order, output in normal order * * Arguments: - uint16_t *a: pointer to in/output polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_invntt(poly *r) { +void PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(poly *r) { PQCLEAN_KYBER768_CLEAN_invntt(r->coeffs); } /************************************************* -* Name: poly_basemul +* Name: PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery * * Description: Multiplication of two polynomials in NTT domain * @@ -148,68 +193,64 @@ void PQCLEAN_KYBER768_CLEAN_poly_invntt(poly *r) { * - const poly *a: pointer to first input polynomial * - const poly 
*b: pointer to second input polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N / 4; ++i) { - PQCLEAN_KYBER768_CLEAN_basemul( - r->coeffs + 4 * i, - a->coeffs + 4 * i, - b->coeffs + 4 * i, - PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); - PQCLEAN_KYBER768_CLEAN_basemul( - r->coeffs + 4 * i + 2, - a->coeffs + 4 * i + 2, - b->coeffs + 4 * i + 2, - -PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); +void PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b) { + unsigned int i = 0; + for (i = 0; i < KYBER_N / 4; i++) { + PQCLEAN_KYBER768_CLEAN_basemul(&r->coeffs[4 * i], &a->coeffs[4 * i], &b->coeffs[4 * i], PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); + PQCLEAN_KYBER768_CLEAN_basemul(&r->coeffs[4 * i + 2], &a->coeffs[4 * i + 2], &b->coeffs[4 * i + 2], + -PQCLEAN_KYBER768_CLEAN_zetas[64 + i]); } } /************************************************* -* Name: poly_frommont +* Name: PQCLEAN_KYBER768_CLEAN_poly_tomont * * Description: Inplace conversion of all coefficients of a polynomial -* from Montgomery domain to normal domain +* from normal domain to Montgomery domain * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_frommont(poly *r) { +void PQCLEAN_KYBER768_CLEAN_poly_tomont(poly *r) { + unsigned int i = 0; const int16_t f = (1ULL << 32) % KYBER_Q; - - for (size_t i = 0; i < KYBER_N; i++) { - r->coeffs[i] = PQCLEAN_KYBER768_CLEAN_montgomery_reduce( - (int32_t)r->coeffs[i] * f); + for (i = 0; i < KYBER_N; i++) { + r->coeffs[i] = PQCLEAN_KYBER768_CLEAN_montgomery_reduce((int32_t)r->coeffs[i] * f); } } /************************************************* -* Name: poly_reduce +* Name: PQCLEAN_KYBER768_CLEAN_poly_reduce * * Description: Applies Barrett reduction to all coefficients of a 
polynomial * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_poly_reduce(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER768_CLEAN_barrett_reduce(r->coeffs[i]); } } /************************************************* -* Name: poly_csubq +* Name: PQCLEAN_KYBER768_CLEAN_poly_csubq * -* Description: Applies conditional subtraction of q to each coefficient of a polynomial -* for details of conditional subtraction of q see comments in reduce.c +* Description: Applies conditional subtraction of q to each coefficient +* of a polynomial. For details of conditional subtraction +* of q see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_poly_csubq(poly *r) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = PQCLEAN_KYBER768_CLEAN_csubq(r->coeffs[i]); } } /************************************************* -* Name: poly_add +* Name: PQCLEAN_KYBER768_CLEAN_poly_add * * Description: Add two polynomials * @@ -218,13 +259,14 @@ void PQCLEAN_KYBER768_CLEAN_poly_csubq(poly *r) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] + b->coeffs[i]; } } /************************************************* -* Name: poly_sub +* Name: PQCLEAN_KYBER768_CLEAN_poly_sub * * Description: Subtract two polynomials * 
@@ -233,48 +275,8 @@ void PQCLEAN_KYBER768_CLEAN_poly_add(poly *r, const poly *a, const poly *b) { * - const poly *b: pointer to second input polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_poly_sub(poly *r, const poly *a, const poly *b) { - for (size_t i = 0; i < KYBER_N; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_N; i++) { r->coeffs[i] = a->coeffs[i] - b->coeffs[i]; } } - -/************************************************* -* Name: poly_frommsg -* -* Description: Convert 32-byte message to polynomial -* -* Arguments: - poly *r: pointer to output polynomial -* - const uint8_t *msg: pointer to input message -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]) { - uint16_t mask; - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - for (size_t j = 0; j < 8; j++) { - mask = -((msg[i] >> j) & 1); - r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2); - } - } -} - -/************************************************* -* Name: poly_tomsg -* -* Description: Convert polynomial to 32-byte message -* -* Arguments: - uint8_t *msg: pointer to output message -* - const poly *a: pointer to input polynomial -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a) { - uint16_t t; - - PQCLEAN_KYBER768_CLEAN_poly_csubq(a); - - for (size_t i = 0; i < KYBER_SYMBYTES; i++) { - msg[i] = 0; - for (size_t j = 0; j < 8; j++) { - t = (((a->coeffs[8 * i + j] << 1) + KYBER_Q / 2) / KYBER_Q) & 1; - msg[i] |= t << j; - } - } -} diff --git a/crypto_kem/kyber768/clean/poly.h b/crypto_kem/kyber768/clean/poly.h index b9e159a1..350bc8de 100644 --- a/crypto_kem/kyber768/clean/poly.h +++ b/crypto_kem/kyber768/clean/poly.h @@ -1,9 +1,9 @@ -#ifndef POLY_H -#define POLY_H +#ifndef PQCLEAN_KYBER768_CLEAN_POLY_H +#define PQCLEAN_KYBER768_CLEAN_POLY_H #include "params.h" - #include + /* * Elements of 
R_q = Z_q[X]/(X^n + 1). Represents polynomial * coeffs[0] + X*coeffs[1] + X^2*xoeffs[2] + ... + X^{n-1}*coeffs[n-1] @@ -12,26 +12,41 @@ typedef struct { int16_t coeffs[KYBER_N]; } poly; -void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t *r, poly *a); -void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t *a); -void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t *r, poly *a); -void PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t *a); +void PQCLEAN_KYBER768_CLEAN_poly_compress(uint8_t r[KYBER_POLYCOMPRESSEDBYTES], poly *a); + +void PQCLEAN_KYBER768_CLEAN_poly_decompress(poly *r, const uint8_t a[KYBER_POLYCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER768_CLEAN_poly_tobytes(uint8_t r[KYBER_POLYBYTES], poly *a); + +void PQCLEAN_KYBER768_CLEAN_poly_frombytes(poly *r, const uint8_t a[KYBER_POLYBYTES]); + + +void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_INDCPA_MSGBYTES]); + +void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_INDCPA_MSGBYTES], poly *a); -void PQCLEAN_KYBER768_CLEAN_poly_frommsg(poly *r, const uint8_t msg[KYBER_SYMBYTES]); -void PQCLEAN_KYBER768_CLEAN_poly_tomsg(uint8_t msg[KYBER_SYMBYTES], poly *a); -void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t *seed, uint8_t nonce); +void PQCLEAN_KYBER768_CLEAN_poly_getnoise(poly *r, const uint8_t seed[KYBER_SYMBYTES], uint8_t nonce); + void PQCLEAN_KYBER768_CLEAN_poly_ntt(poly *r); -void PQCLEAN_KYBER768_CLEAN_poly_invntt(poly *r); -void PQCLEAN_KYBER768_CLEAN_poly_basemul(poly *r, const poly *a, const poly *b); -void PQCLEAN_KYBER768_CLEAN_poly_frommont(poly *r); + +void PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(poly *r); + +void PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(poly *r, const poly *a, const poly *b); + +void PQCLEAN_KYBER768_CLEAN_poly_tomont(poly *r); + void PQCLEAN_KYBER768_CLEAN_poly_reduce(poly *r); + void PQCLEAN_KYBER768_CLEAN_poly_csubq(poly *r); + void PQCLEAN_KYBER768_CLEAN_poly_add(poly *r, const poly *a, const poly *b); + 
void PQCLEAN_KYBER768_CLEAN_poly_sub(poly *r, const poly *a, const poly *b); #endif diff --git a/crypto_kem/kyber768/clean/polyvec.c b/crypto_kem/kyber768/clean/polyvec.c index e70b61ca..9a8193b1 100644 --- a/crypto_kem/kyber768/clean/polyvec.c +++ b/crypto_kem/kyber768/clean/polyvec.c @@ -1,128 +1,153 @@ -#include "polyvec.h" - +#include "params.h" #include "poly.h" - -#include +#include "polyvec.h" #include + /************************************************* -* Name: polyvec_compress +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_compress * * Description: Compress and serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECCOMPRESSEDBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECCOMPRESSEDBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t *r, polyvec *a) { +void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a) { + unsigned int i = 0, j = 0, k = 0; + PQCLEAN_KYBER768_CLEAN_polyvec_csubq(a); uint16_t t[4]; - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - for (size_t k = 0; k < 4; k++) { - t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) / KYBER_Q) & 0x3ff; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + for (k = 0; k < 4; k++) { + { + t[k] = ((((uint32_t)a->vec[i].coeffs[4 * j + k] << 10) + KYBER_Q / 2) + / KYBER_Q) & 0x3ff; + } } - r[5 * j + 0] = (uint8_t)t[0]; - r[5 * j + 1] = (uint8_t)((t[0] >> 8) | ((t[1] & 0x3f) << 2)); - r[5 * j + 2] = (uint8_t)((t[1] >> 6) | ((t[2] & 0x0f) << 4)); - r[5 * j + 3] = (uint8_t)((t[2] >> 4) | ((t[3] & 0x03) << 6)); - r[5 * j + 4] = (uint8_t)((t[3] >> 2)); + r[0] = (t[0] >> 0); + r[1] = (t[0] >> 8) | (t[1] << 2); + r[2] 
= (t[1] >> 6) | (t[2] << 4); + r[3] = (t[2] >> 4) | (t[3] << 6); + r[4] = (t[3] >> 2); + r += 5; } - r += 320; } } /************************************************* -* Name: polyvec_decompress +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_decompress * * Description: De-serialize and decompress vector of polynomials; -* approximate inverse of polyvec_compress +* approximate inverse of PQCLEAN_KYBER768_CLEAN_polyvec_compress * * Arguments: - polyvec *r: pointer to output vector of polynomials -* - uint8_t *a: pointer to input byte array (of length KYBER_POLYVECCOMPRESSEDBYTES) +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECCOMPRESSEDBYTES) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { - for (size_t j = 0; j < KYBER_N / 4; j++) { - r->vec[i].coeffs[4 * j + 0] = (int16_t)( (((a[5 * j + 0] | (((uint32_t)a[5 * j + 1] & 0x03) << 8)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 1] = (int16_t)(((((a[5 * j + 1] >> 2) | (((uint32_t)a[5 * j + 2] & 0x0f) << 6)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 2] = (int16_t)(((((a[5 * j + 2] >> 4) | (((uint32_t)a[5 * j + 3] & 0x3f) << 4)) * KYBER_Q) + 512) >> 10); - r->vec[i].coeffs[4 * j + 3] = (int16_t)(((((a[5 * j + 3] >> 6) | (((uint32_t)a[5 * j + 4] & 0xff) << 2)) * KYBER_Q) + 512) >> 10); +void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t a[KYBER_POLYVECCOMPRESSEDBYTES]) { + unsigned int i = 0, j = 0, k = 0; + + uint16_t t[4]; + for (i = 0; i < KYBER_K; i++) { + for (j = 0; j < KYBER_N / 4; j++) { + t[0] = (a[0] >> 0) | ((uint16_t)a[1] << 8); + t[1] = (a[1] >> 2) | ((uint16_t)a[2] << 6); + t[2] = (a[2] >> 4) | ((uint16_t)a[3] << 4); + t[3] = (a[3] >> 6) | ((uint16_t)a[4] << 2); + a += 5; + + for (k = 0; k < 4; k++) { + r->vec[i].coeffs[4 * j + k] = ((uint32_t)(t[k] & 0x3FF) * KYBER_Q + 512) >> 10; + } } - a += 320; } } 
/************************************************* -* Name: polyvec_tobytes +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_tobytes * * Description: Serialize vector of polynomials * -* Arguments: - uint8_t *r: pointer to output byte array (needs space for KYBER_POLYVECBYTES) -* - const polyvec *a: pointer to input vector of polynomials +* Arguments: - uint8_t *r: pointer to output byte array +* (needs space for KYBER_POLYVECBYTES) +* - polyvec *a: pointer to input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]); } } /************************************************* -* Name: polyvec_frombytes +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_frombytes * * Description: De-serialize vector of polynomials; -* inverse of polyvec_tobytes +* inverse of PQCLEAN_KYBER768_CLEAN_polyvec_tobytes * -* Arguments: - uint8_t *r: pointer to output byte array -* - const polyvec *a: pointer to input vector of polynomials (of length KYBER_POLYVECBYTES) +* Arguments: - polyvec *r: pointer to output vector of polynomials +* - const uint8_t *a: pointer to input byte array +* (of length KYBER_POLYVECBYTES) **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a) { - for (size_t i = 0; i < KYBER_K; i++) { +void PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES); } } /************************************************* -* Name: polyvec_ntt +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_ntt * * Description: Apply forward NTT 
to all elements of a vector of polynomials * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ void PQCLEAN_KYBER768_CLEAN_polyvec_ntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_ntt(&r->vec[i]); } } /************************************************* -* Name: polyvec_invntt +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont * * Description: Apply inverse NTT to all elements of a vector of polynomials +* and multiply by Montgomery factor 2^16 * * Arguments: - polyvec *r: pointer to in/output vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_invntt(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_invntt(&r->vec[i]); +void PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(polyvec *r) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_poly_invntt_tomont(&r->vec[i]); } } /************************************************* -* Name: polyvec_pointwise_acc +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery * -* Description: Pointwise multiply elements of a and b and accumulate into r +* Description: Pointwise multiply elements of a and b, accumulate into r, +* and multiply by 2^-16. 
* * Arguments: - poly *r: pointer to output polynomial * - const polyvec *a: pointer to first input vector of polynomials * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ -void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) { +void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b) { + unsigned int i = 0; poly t; - PQCLEAN_KYBER768_CLEAN_poly_basemul(r, &a->vec[0], &b->vec[0]); - for (size_t i = 1; i < KYBER_K; i++) { - PQCLEAN_KYBER768_CLEAN_poly_basemul(&t, &a->vec[i], &b->vec[i]); + PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(r, &a->vec[0], &b->vec[0]); + for (i = 1; i < KYBER_K; i++) { + PQCLEAN_KYBER768_CLEAN_poly_basemul_montgomery(&t, &a->vec[i], &b->vec[i]); PQCLEAN_KYBER768_CLEAN_poly_add(r, r, &t); } @@ -130,37 +155,40 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, con } /************************************************* -* Name: polyvec_reduce +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_reduce * * Description: Applies Barrett reduction to each coefficient * of each element of a vector of polynomials * for details of the Barrett reduction see comments in reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_polyvec_reduce(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_reduce(&r->vec[i]); } } /************************************************* -* Name: polyvec_csubq +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_csubq * * Description: Applies conditional subtraction of q to each coefficient * of each element of a vector of polynomials -* for details of conditional subtraction of q see comments in reduce.c +* for details of 
conditional subtraction of q see comments in +* reduce.c * -* Arguments: - poly *r: pointer to input/output polynomial +* Arguments: - poly *r: pointer to input/output polynomial **************************************************/ void PQCLEAN_KYBER768_CLEAN_polyvec_csubq(polyvec *r) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_csubq(&r->vec[i]); } } /************************************************* -* Name: polyvec_add +* Name: PQCLEAN_KYBER768_CLEAN_polyvec_add * * Description: Add vectors of polynomials * @@ -169,7 +197,8 @@ void PQCLEAN_KYBER768_CLEAN_polyvec_csubq(polyvec *r) { * - const polyvec *b: pointer to second input vector of polynomials **************************************************/ void PQCLEAN_KYBER768_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) { - for (size_t i = 0; i < KYBER_K; i++) { + unsigned int i = 0; + for (i = 0; i < KYBER_K; i++) { PQCLEAN_KYBER768_CLEAN_poly_add(&r->vec[i], &a->vec[i], &b->vec[i]); } } diff --git a/crypto_kem/kyber768/clean/polyvec.h b/crypto_kem/kyber768/clean/polyvec.h index 94b9ee2a..70d4b0fe 100644 --- a/crypto_kem/kyber768/clean/polyvec.h +++ b/crypto_kem/kyber768/clean/polyvec.h @@ -1,29 +1,41 @@ -#ifndef POLYVEC_H -#define POLYVEC_H +#ifndef PQCLEAN_KYBER768_CLEAN_POLYVEC_H +#define PQCLEAN_KYBER768_CLEAN_POLYVEC_H #include "params.h" #include "poly.h" - #include typedef struct { poly vec[KYBER_K]; } polyvec; -void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, const uint8_t *a); -void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t *r, polyvec *a); -void PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t *a); +void PQCLEAN_KYBER768_CLEAN_polyvec_compress(uint8_t r[KYBER_POLYVECCOMPRESSEDBYTES], polyvec *a); + +void PQCLEAN_KYBER768_CLEAN_polyvec_decompress(polyvec *r, + const uint8_t 
a[KYBER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_KYBER768_CLEAN_polyvec_tobytes(uint8_t r[KYBER_POLYVECBYTES], polyvec *a); + +void PQCLEAN_KYBER768_CLEAN_polyvec_frombytes(polyvec *r, const uint8_t a[KYBER_POLYVECBYTES]); + void PQCLEAN_KYBER768_CLEAN_polyvec_ntt(polyvec *r); -void PQCLEAN_KYBER768_CLEAN_polyvec_invntt(polyvec *r); -void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b); +void PQCLEAN_KYBER768_CLEAN_polyvec_invntt_tomont(polyvec *r); + + +void PQCLEAN_KYBER768_CLEAN_polyvec_pointwise_acc_montgomery(poly *r, + const polyvec *a, + const polyvec *b); + void PQCLEAN_KYBER768_CLEAN_polyvec_reduce(polyvec *r); + void PQCLEAN_KYBER768_CLEAN_polyvec_csubq(polyvec *r); + void PQCLEAN_KYBER768_CLEAN_polyvec_add(polyvec *r, const polyvec *a, const polyvec *b); #endif diff --git a/crypto_kem/kyber768/clean/reduce.c b/crypto_kem/kyber768/clean/reduce.c index ea2aa1b0..0bf2b1bd 100644 --- a/crypto_kem/kyber768/clean/reduce.c +++ b/crypto_kem/kyber768/clean/reduce.c @@ -1,32 +1,32 @@ -#include "reduce.h" - #include "params.h" - +#include "reduce.h" #include + /************************************************* -* Name: montgomery_reduce +* Name: PQCLEAN_KYBER768_CLEAN_montgomery_reduce * * Description: Montgomery reduction; given a 32-bit integer a, computes * 16-bit integer congruent to a * R^-1 mod q, * where R=2^16 * -* Arguments: - int32_t a: input integer to be reduced; has to be in {-q2^15,...,q2^15-1} +* Arguments: - int32_t a: input integer to be reduced; +* has to be in {-q2^15,...,q2^15-1} * * Returns: integer in {-q+1,...,q-1} congruent to a * R^-1 modulo q. 
**************************************************/ int16_t PQCLEAN_KYBER768_CLEAN_montgomery_reduce(int32_t a) { - int32_t t; - int16_t u; + int32_t t = 0; + int16_t u = 0; u = (int16_t)(a * (int64_t)QINV); t = (int32_t)u * KYBER_Q; t = a - t; t >>= 16; - return (int16_t)t; + return t; } /************************************************* -* Name: barrett_reduce +* Name: PQCLEAN_KYBER768_CLEAN_barrett_reduce * * Description: Barrett reduction; given a 16-bit integer a, computes * 16-bit integer congruent to a mod q in {0,...,q} @@ -36,21 +36,20 @@ int16_t PQCLEAN_KYBER768_CLEAN_montgomery_reduce(int32_t a) { * Returns: integer in {0,...,q} congruent to a modulo q. **************************************************/ int16_t PQCLEAN_KYBER768_CLEAN_barrett_reduce(int16_t a) { - int32_t t; - const int32_t v = (1U << 26) / KYBER_Q + 1; + int16_t t = 0; + const int16_t v = ((1U << 26) + KYBER_Q / 2) / KYBER_Q; - t = v * a; - t >>= 26; + t = (int32_t)v * a >> 26; t *= KYBER_Q; - return a - (int16_t)t; + return a - t; } /************************************************* -* Name: csubq +* Name: PQCLEAN_KYBER768_CLEAN_csubq * * Description: Conditionallly subtract q * -* Arguments: - int16_t a: input integer +* Arguments: - int16_t a: input integer * * Returns: a - q if a >= q, else a **************************************************/ diff --git a/crypto_kem/kyber768/clean/reduce.h b/crypto_kem/kyber768/clean/reduce.h index 7a679969..120002d7 100644 --- a/crypto_kem/kyber768/clean/reduce.h +++ b/crypto_kem/kyber768/clean/reduce.h @@ -1,15 +1,19 @@ -#ifndef REDUCE_H -#define REDUCE_H +#ifndef PQCLEAN_KYBER768_CLEAN_REDUCE_H +#define PQCLEAN_KYBER768_CLEAN_REDUCE_H +#include "params.h" #include -#define MONT 2285 // 2^16 % Q -#define QINV 62209 // q^(-1) mod 2^16 +#define MONT 2285 // 2^16 mod q +#define QINV 62209 // q^-1 mod 2^16 + int16_t PQCLEAN_KYBER768_CLEAN_montgomery_reduce(int32_t a); + int16_t PQCLEAN_KYBER768_CLEAN_barrett_reduce(int16_t a); + int16_t 
PQCLEAN_KYBER768_CLEAN_csubq(int16_t a); #endif diff --git a/crypto_kem/kyber768/clean/symmetric-fips202.c b/crypto_kem/kyber768/clean/symmetric-fips202.c deleted file mode 100644 index 6957b586..00000000 --- a/crypto_kem/kyber768/clean/symmetric-fips202.c +++ /dev/null @@ -1,63 +0,0 @@ -#include "fips202.h" -#include "symmetric.h" - -#include -/************************************************* -* Name: kyber_shake128_absorb -* -* Description: Absorb step of the SHAKE128 specialized for the Kyber context. - -* Arguments: - keccak_state *s: pointer to (uninitialized) output Keccak state -* - const uint8_t *input: pointer to KYBER_SYMBYTES input to be absorbed into s -* - uint8_t i additional byte of input -* - uint8_t j additional byte of input -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y) { - size_t i; - uint8_t extseed[KYBER_SYMBYTES + 2]; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extseed[i] = input[i]; - } - extseed[i++] = x; - extseed[i] = y; - shake128_absorb(s, extseed, KYBER_SYMBYTES + 2); -} - -/************************************************* -* Name: kyber_shake128_squeezeblocks -* -* Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of SHAKE128_RATE bytes each. -* Modifies the state. Can be called multiple times to keep squeezing, -* i.e., is incremental. 
-* -* Arguments: - uint8_t *output: pointer to output blocks -* - unsigned long long nblocks: number of blocks to be squeezed (written to output) -* - keccak_state *s: pointer to in/output Keccak state -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s) { - shake128_squeezeblocks(output, nblocks, s); -} - -/************************************************* -* Name: shake256_prf -* -* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input -* and then generates outlen bytes of SHAKE256 output -* -* Arguments: - uint8_t *output: pointer to output -* - size_t outlen: number of requested output bytes -* - const uint8_t * key: pointer to the key (of length KYBER_SYMBYTES) -* - const uint8_t nonce: single-byte nonce (public PRF input) -**************************************************/ -void PQCLEAN_KYBER768_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce) { - uint8_t extkey[KYBER_SYMBYTES + 1]; - size_t i; - - for (i = 0; i < KYBER_SYMBYTES; i++) { - extkey[i] = key[i]; - } - extkey[i] = nonce; - - shake256(output, outlen, extkey, KYBER_SYMBYTES + 1); -} diff --git a/crypto_kem/kyber768/clean/symmetric-shake.c b/crypto_kem/kyber768/clean/symmetric-shake.c new file mode 100644 index 00000000..3dc60426 --- /dev/null +++ b/crypto_kem/kyber768/clean/symmetric-shake.c @@ -0,0 +1,60 @@ +#include "fips202.h" +#include "params.h" +#include "symmetric.h" +#include +#include + +/************************************************* +* Name: PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb +* +* Description: Absorb step of the SHAKE128 specialized for the Kyber context. 
+* +* Arguments: - xof_state *state: pointer to (uninitialized) output +* Keccak state +* - const uint8_t *seed: pointer to KYBER_SYMBYTES input +* to be absorbed into state +* - uint8_t x: additional byte of input +* - uint8_t y: additional byte of input +**************************************************/ +void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *state, + const uint8_t seed[KYBER_SYMBYTES], + uint8_t x, + uint8_t y) { + unsigned int i = 0; + uint8_t extseed[KYBER_SYMBYTES + 2]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extseed[i] = seed[i]; + } + extseed[i++] = x; + extseed[i] = y; + + shake128_absorb(state, extseed, sizeof(extseed)); +} + +/************************************************* +* Name: PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf +* +* Description: Usage of SHAKE256 as a PRF, concatenates secret and public input +* and then generates outlen bytes of SHAKE256 output +* +* Arguments: - uint8_t *out: pointer to output +* - size_t outlen: number of requested output bytes +* - const uint8_t *key: pointer to the key +* (of length KYBER_SYMBYTES) +* - uint8_t nonce: single-byte nonce (public PRF input) +**************************************************/ +void PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce) { + unsigned int i = 0; + uint8_t extkey[KYBER_SYMBYTES + 1]; + + for (i = 0; i < KYBER_SYMBYTES; i++) { + extkey[i] = key[i]; + } + extkey[i] = nonce; + + shake256(out, outlen, extkey, sizeof(extkey)); +} diff --git a/crypto_kem/kyber768/clean/symmetric.h b/crypto_kem/kyber768/clean/symmetric.h index 0d4ac2b7..b446aebe 100644 --- a/crypto_kem/kyber768/clean/symmetric.h +++ b/crypto_kem/kyber768/clean/symmetric.h @@ -2,29 +2,35 @@ #define SYMMETRIC_H #include "params.h" +#include +#include #include "fips202.h" -#include +typedef shake128ctx xof_state; + +void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(xof_state *s, + const uint8_t seed[KYBER_SYMBYTES], 
+ uint8_t x, + uint8_t y); -typedef shake128ctx keccak_state; +void PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(uint8_t *out, + size_t outlen, + const uint8_t key[KYBER_SYMBYTES], + uint8_t nonce); -void PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(keccak_state *s, const uint8_t *input, uint8_t x, uint8_t y); -void PQCLEAN_KYBER768_CLEAN_kyber_shake128_squeezeblocks(uint8_t *output, size_t nblocks, keccak_state *s); -void PQCLEAN_KYBER768_CLEAN_shake256_prf(uint8_t *output, size_t outlen, const uint8_t *key, uint8_t nonce); +#define XOF_BLOCKBYTES SHAKE128_RATE #define hash_h(OUT, IN, INBYTES) sha3_256(OUT, IN, INBYTES) #define hash_g(OUT, IN, INBYTES) sha3_512(OUT, IN, INBYTES) -#define xof_absorb(STATE, IN, X, Y) PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(STATE, IN, X, Y) -#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) PQCLEAN_KYBER768_CLEAN_kyber_shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) +#define xof_absorb(STATE, SEED, X, Y) PQCLEAN_KYBER768_CLEAN_kyber_shake128_absorb(STATE, SEED, X, Y) +#define xof_squeezeblocks(OUT, OUTBLOCKS, STATE) \ + shake128_squeezeblocks(OUT, OUTBLOCKS, STATE) #define xof_ctx_release(STATE) shake128_ctx_release(STATE) -#define prf(OUT, OUTBYTES, KEY, NONCE) PQCLEAN_KYBER768_CLEAN_shake256_prf(OUT, OUTBYTES, KEY, NONCE) +#define prf(OUT, OUTBYTES, KEY, NONCE) \ + PQCLEAN_KYBER768_CLEAN_kyber_shake256_prf(OUT, OUTBYTES, KEY, NONCE) #define kdf(OUT, IN, INBYTES) shake256(OUT, KYBER_SSBYTES, IN, INBYTES) -#define XOF_BLOCKBYTES 168 - -typedef keccak_state xof_state; - #endif /* SYMMETRIC_H */ diff --git a/crypto_kem/kyber768/clean/verify.c b/crypto_kem/kyber768/clean/verify.c index df2224a9..03e3573e 100644 --- a/crypto_kem/kyber768/clean/verify.c +++ b/crypto_kem/kyber768/clean/verify.c @@ -1,34 +1,31 @@ #include "verify.h" - #include #include /************************************************* -* Name: verify +* Name: PQCLEAN_KYBER768_CLEAN_verify * * Description: Compare two arrays for equality in constant time. 
* * Arguments: const uint8_t *a: pointer to first byte array * const uint8_t *b: pointer to second byte array -* size_t len: length of the byte arrays +* size_t len: length of the byte arrays * * Returns 0 if the byte arrays are equal, 1 otherwise **************************************************/ -uint8_t PQCLEAN_KYBER768_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { - uint64_t r; - size_t i; - r = 0; +int PQCLEAN_KYBER768_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { + size_t i = 0; + uint8_t r = 0; for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } - r = (-r) >> 63; - return (uint8_t)r; + return (-(uint64_t)r) >> 63; } /************************************************* -* Name: cmov +* Name: PQCLEAN_KYBER768_CLEAN_cmov * * Description: Copy len bytes from x to r if b is 1; * don't modify x if b is 0. Requires b to be in {0,1}; @@ -37,14 +34,14 @@ uint8_t PQCLEAN_KYBER768_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t * * Arguments: uint8_t *r: pointer to output byte array * const uint8_t *x: pointer to input byte array -* size_t len: Amount of bytes to be copied +* size_t len: Amount of bytes to be copied * uint8_t b: Condition bit; has to be in {0,1} **************************************************/ void PQCLEAN_KYBER768_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { - size_t i; + size_t i = 0; b = -b; for (i = 0; i < len; i++) { - r[i] ^= b & (x[i] ^ r[i]); + r[i] ^= b & (r[i] ^ x[i]); } } diff --git a/crypto_kem/kyber768/clean/verify.h b/crypto_kem/kyber768/clean/verify.h index a939d9d1..430ca7b3 100644 --- a/crypto_kem/kyber768/clean/verify.h +++ b/crypto_kem/kyber768/clean/verify.h @@ -1,10 +1,13 @@ -#ifndef VERIFY_H -#define VERIFY_H +#ifndef PQCLEAN_KYBER768_CLEAN_VERIFY_H +#define PQCLEAN_KYBER768_CLEAN_VERIFY_H +#include "params.h" #include #include -uint8_t PQCLEAN_KYBER768_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + +int PQCLEAN_KYBER768_CLEAN_verify(const uint8_t *a, 
const uint8_t *b, size_t len); + void PQCLEAN_KYBER768_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); diff --git a/test/duplicate_consistency/kyber1024-90s_avx2.yml b/test/duplicate_consistency/kyber1024-90s_avx2.yml index 69779016..9fa0aae7 100644 --- a/test/duplicate_consistency/kyber1024-90s_avx2.yml +++ b/test/duplicate_consistency/kyber1024-90s_avx2.yml @@ -18,17 +18,16 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - polyvec.h - reduce.h - - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - verify.c - verify.h - source: @@ -52,18 +51,17 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber1024_avx2.yml b/test/duplicate_consistency/kyber1024_avx2.yml index d8028676..6e45bb97 100644 --- a/test/duplicate_consistency/kyber1024_avx2.yml +++ b/test/duplicate_consistency/kyber1024_avx2.yml @@ -8,7 +8,7 @@ consistency_checks: - indcpa.h - params.h - polyvec.h - - symmetric-fips202.c + - symmetric-shake.c - verify.h - source: scheme: kyber768 @@ -21,19 +21,19 @@ consistency_checks: - fips202x4.c - fips202x4.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s - - symmetric-fips202.c + - shuffle.S + - symmetric-shake.c - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber1024_clean.yml b/test/duplicate_consistency/kyber1024_clean.yml index 531fdd64..2bf1c2dc 100644 --- a/test/duplicate_consistency/kyber1024_clean.yml +++ b/test/duplicate_consistency/kyber1024_clean.yml @@ -14,7 +14,7 @@ consistency_checks: - polyvec.h - 
reduce.c - reduce.h - - symmetric-fips202.c + - symmetric-shake.c - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber512-90s_avx2.yml b/test/duplicate_consistency/kyber512-90s_avx2.yml index b4b5f24d..8c2a8ffd 100644 --- a/test/duplicate_consistency/kyber512-90s_avx2.yml +++ b/test/duplicate_consistency/kyber512-90s_avx2.yml @@ -18,17 +18,16 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - polyvec.h - reduce.h - - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - verify.c - verify.h - source: @@ -52,18 +51,18 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber512_avx2.yml b/test/duplicate_consistency/kyber512_avx2.yml index b31cc6da..a154ac68 100644 --- a/test/duplicate_consistency/kyber512_avx2.yml +++ b/test/duplicate_consistency/kyber512_avx2.yml @@ -8,7 +8,7 @@ consistency_checks: - indcpa.h - params.h - polyvec.h - - symmetric-fips202.c + - symmetric-shake.c - verify.h - source: scheme: kyber768 @@ -21,19 +21,19 @@ consistency_checks: - fips202x4.c - fips202x4.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s - - symmetric-fips202.c + - shuffle.S + - symmetric-shake.c - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber512_clean.yml b/test/duplicate_consistency/kyber512_clean.yml index 531fdd64..2bf1c2dc 100644 --- a/test/duplicate_consistency/kyber512_clean.yml +++ b/test/duplicate_consistency/kyber512_clean.yml @@ -14,7 +14,7 @@ consistency_checks: - polyvec.h - reduce.c - reduce.h - - 
symmetric-fips202.c + - symmetric-shake.c - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber768-90s_avx2.yml b/test/duplicate_consistency/kyber768-90s_avx2.yml index 9bbc291c..fe370670 100644 --- a/test/duplicate_consistency/kyber768-90s_avx2.yml +++ b/test/duplicate_consistency/kyber768-90s_avx2.yml @@ -18,17 +18,17 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - polyvec.h - reduce.h - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - verify.c - verify.h consistency_checks: @@ -53,18 +53,17 @@ consistency_checks: - consts.c - consts.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s + - shuffle.S - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber768_avx2.yml b/test/duplicate_consistency/kyber768_avx2.yml index 4f94e25a..e1c67def 100644 --- a/test/duplicate_consistency/kyber768_avx2.yml +++ b/test/duplicate_consistency/kyber768_avx2.yml @@ -8,7 +8,7 @@ consistency_checks: - indcpa.h - params.h - polyvec.h - - symmetric-fips202.c + - symmetric-shake.c - verify.h - source: scheme: kyber512 @@ -21,19 +21,19 @@ consistency_checks: - fips202x4.c - fips202x4.h - fq.inc - - fq.s + - fq.S - indcpa.h - - invntt.s + - invntt.S - ntt.h - - ntt.s + - ntt.S - poly.h - polyvec.h - reduce.h - rejsample.c - rejsample.h - shuffle.inc - - shuffle.s - - symmetric-fips202.c + - shuffle.S + - symmetric-shake.c - symmetric.h - verify.c - verify.h diff --git a/test/duplicate_consistency/kyber768_clean.yml b/test/duplicate_consistency/kyber768_clean.yml index fc1a7357..f4c7518a 100644 --- a/test/duplicate_consistency/kyber768_clean.yml +++ b/test/duplicate_consistency/kyber768_clean.yml @@ -14,7 +14,7 @@ consistency_checks: - polyvec.h - reduce.c - reduce.h - - symmetric-fips202.c + - 
symmetric-shake.c - symmetric.h - verify.c - verify.h