From 0a3146831f0112d01a620a4e5d3f50b87bc50fe1 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Thu, 15 Oct 2020 20:00:04 -0400 Subject: [PATCH 01/10] Update Saber and add AVX2 implementation --- crypto_kem/firesaber/META.yml | 11 +- crypto_kem/firesaber/avx2/LICENSE | 1 + crypto_kem/firesaber/avx2/Makefile | 22 + crypto_kem/firesaber/avx2/SABER_indcpa.c | 416 +++++++ crypto_kem/firesaber/avx2/SABER_indcpa.h | 13 + crypto_kem/firesaber/avx2/SABER_params.h | 45 + crypto_kem/firesaber/avx2/api.h | 18 + crypto_kem/firesaber/avx2/cbd.c | 52 + crypto_kem/firesaber/avx2/cbd.h | 16 + crypto_kem/firesaber/avx2/kem.c | 79 ++ crypto_kem/firesaber/avx2/kem.h | 35 + crypto_kem/firesaber/avx2/pack_unpack.c | 502 ++++++++ crypto_kem/firesaber/avx2/pack_unpack.h | 56 + crypto_kem/firesaber/avx2/poly.h | 27 + crypto_kem/firesaber/avx2/polymul/consts.h | 20 + crypto_kem/firesaber/avx2/polymul/matrix.c | 303 +++++ crypto_kem/firesaber/avx2/polymul/scm_avx.c | 753 ++++++++++++ .../firesaber/avx2/polymul/toom-cook_4way.c | 1010 +++++++++++++++++ crypto_kem/firesaber/avx2/verify.c | 35 + crypto_kem/firesaber/avx2/verify.h | 22 + crypto_kem/firesaber/clean/LICENSE | 9 +- crypto_kem/firesaber/clean/Makefile | 4 +- crypto_kem/firesaber/clean/SABER_indcpa.c | 316 +----- crypto_kem/firesaber/clean/SABER_indcpa.h | 12 +- crypto_kem/firesaber/clean/SABER_params.h | 42 +- crypto_kem/firesaber/clean/api.h | 12 +- crypto_kem/firesaber/clean/cbd.c | 34 +- crypto_kem/firesaber/clean/cbd.h | 7 +- crypto_kem/firesaber/clean/kem.c | 78 +- crypto_kem/firesaber/clean/pack_unpack.c | 336 ++---- crypto_kem/firesaber/clean/pack_unpack.h | 29 +- crypto_kem/firesaber/clean/poly.c | 58 +- crypto_kem/firesaber/clean/poly.h | 23 +- crypto_kem/firesaber/clean/poly_mul.c | 18 +- crypto_kem/firesaber/clean/poly_mul.h | 8 +- crypto_kem/firesaber/clean/verify.c | 13 +- crypto_kem/firesaber/clean/verify.h | 7 +- crypto_kem/lightsaber/META.yml | 11 +- crypto_kem/lightsaber/avx2/LICENSE | 1 + 
crypto_kem/lightsaber/avx2/Makefile | 22 + crypto_kem/lightsaber/avx2/SABER_indcpa.c | 416 +++++++ crypto_kem/lightsaber/avx2/SABER_indcpa.h | 13 + crypto_kem/lightsaber/avx2/SABER_params.h | 46 + crypto_kem/lightsaber/avx2/api.h | 18 + crypto_kem/lightsaber/avx2/cbd.c | 51 + crypto_kem/lightsaber/avx2/cbd.h | 16 + crypto_kem/lightsaber/avx2/kem.c | 79 ++ crypto_kem/lightsaber/avx2/kem.h | 35 + crypto_kem/lightsaber/avx2/pack_unpack.c | 502 ++++++++ crypto_kem/lightsaber/avx2/pack_unpack.h | 56 + crypto_kem/lightsaber/avx2/poly.h | 27 + crypto_kem/lightsaber/avx2/polymul/consts.h | 20 + crypto_kem/lightsaber/avx2/polymul/matrix.c | 303 +++++ crypto_kem/lightsaber/avx2/polymul/scm_avx.c | 753 ++++++++++++ .../lightsaber/avx2/polymul/toom-cook_4way.c | 1010 +++++++++++++++++ crypto_kem/lightsaber/avx2/verify.c | 35 + crypto_kem/lightsaber/avx2/verify.h | 22 + crypto_kem/lightsaber/clean/LICENSE | 9 +- crypto_kem/lightsaber/clean/Makefile | 4 +- crypto_kem/lightsaber/clean/SABER_indcpa.c | 316 +----- crypto_kem/lightsaber/clean/SABER_indcpa.h | 12 +- crypto_kem/lightsaber/clean/SABER_params.h | 43 +- crypto_kem/lightsaber/clean/api.h | 12 +- crypto_kem/lightsaber/clean/cbd.c | 27 +- crypto_kem/lightsaber/clean/cbd.h | 7 +- crypto_kem/lightsaber/clean/kem.c | 78 +- crypto_kem/lightsaber/clean/pack_unpack.c | 338 ++---- crypto_kem/lightsaber/clean/pack_unpack.h | 29 +- crypto_kem/lightsaber/clean/poly.c | 58 +- crypto_kem/lightsaber/clean/poly.h | 23 +- crypto_kem/lightsaber/clean/poly_mul.c | 18 +- crypto_kem/lightsaber/clean/poly_mul.h | 8 +- crypto_kem/lightsaber/clean/verify.c | 13 +- crypto_kem/lightsaber/clean/verify.h | 7 +- crypto_kem/saber/META.yml | 11 +- crypto_kem/saber/avx2/LICENSE | 1 + crypto_kem/saber/avx2/Makefile | 22 + crypto_kem/saber/avx2/SABER_indcpa.c | 416 +++++++ crypto_kem/saber/avx2/SABER_indcpa.h | 13 + crypto_kem/saber/avx2/SABER_params.h | 46 + crypto_kem/saber/avx2/api.h | 18 + crypto_kem/saber/avx2/cbd.c | 51 + crypto_kem/saber/avx2/cbd.h 
| 16 + crypto_kem/saber/avx2/kem.c | 79 ++ crypto_kem/saber/avx2/kem.h | 35 + crypto_kem/saber/avx2/pack_unpack.c | 502 ++++++++ crypto_kem/saber/avx2/pack_unpack.h | 56 + crypto_kem/saber/avx2/poly.h | 27 + crypto_kem/saber/avx2/polymul/consts.h | 20 + crypto_kem/saber/avx2/polymul/matrix.c | 303 +++++ crypto_kem/saber/avx2/polymul/scm_avx.c | 753 ++++++++++++ .../saber/avx2/polymul/toom-cook_4way.c | 1010 +++++++++++++++++ crypto_kem/saber/avx2/verify.c | 35 + crypto_kem/saber/avx2/verify.h | 22 + crypto_kem/saber/clean/LICENSE | 9 +- crypto_kem/saber/clean/Makefile | 4 +- crypto_kem/saber/clean/SABER_indcpa.c | 316 +----- crypto_kem/saber/clean/SABER_indcpa.h | 12 +- crypto_kem/saber/clean/SABER_params.h | 43 +- crypto_kem/saber/clean/api.h | 12 +- crypto_kem/saber/clean/cbd.c | 31 +- crypto_kem/saber/clean/cbd.h | 7 +- crypto_kem/saber/clean/kem.c | 78 +- crypto_kem/saber/clean/pack_unpack.c | 340 ++---- crypto_kem/saber/clean/pack_unpack.h | 29 +- crypto_kem/saber/clean/poly.c | 58 +- crypto_kem/saber/clean/poly.h | 23 +- crypto_kem/saber/clean/poly_mul.c | 18 +- crypto_kem/saber/clean/poly_mul.h | 8 +- crypto_kem/saber/clean/verify.c | 13 +- crypto_kem/saber/clean/verify.h | 7 +- test/duplicate_consistency/firesaber_avx2.yml | 7 + .../duplicate_consistency/firesaber_clean.yml | 36 +- .../duplicate_consistency/lightsaber_avx2.yml | 45 + .../lightsaber_clean.yml | 78 +- test/duplicate_consistency/saber_avx2.yml | 26 + test/duplicate_consistency/saber_clean.yml | 57 +- 117 files changed, 11459 insertions(+), 2114 deletions(-) create mode 100644 crypto_kem/firesaber/avx2/LICENSE create mode 100644 crypto_kem/firesaber/avx2/Makefile create mode 100644 crypto_kem/firesaber/avx2/SABER_indcpa.c create mode 100644 crypto_kem/firesaber/avx2/SABER_indcpa.h create mode 100644 crypto_kem/firesaber/avx2/SABER_params.h create mode 100644 crypto_kem/firesaber/avx2/api.h create mode 100644 crypto_kem/firesaber/avx2/cbd.c create mode 100644 crypto_kem/firesaber/avx2/cbd.h 
create mode 100644 crypto_kem/firesaber/avx2/kem.c create mode 100644 crypto_kem/firesaber/avx2/kem.h create mode 100644 crypto_kem/firesaber/avx2/pack_unpack.c create mode 100644 crypto_kem/firesaber/avx2/pack_unpack.h create mode 100644 crypto_kem/firesaber/avx2/poly.h create mode 100644 crypto_kem/firesaber/avx2/polymul/consts.h create mode 100644 crypto_kem/firesaber/avx2/polymul/matrix.c create mode 100644 crypto_kem/firesaber/avx2/polymul/scm_avx.c create mode 100644 crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c create mode 100644 crypto_kem/firesaber/avx2/verify.c create mode 100644 crypto_kem/firesaber/avx2/verify.h create mode 100644 crypto_kem/lightsaber/avx2/LICENSE create mode 100644 crypto_kem/lightsaber/avx2/Makefile create mode 100644 crypto_kem/lightsaber/avx2/SABER_indcpa.c create mode 100644 crypto_kem/lightsaber/avx2/SABER_indcpa.h create mode 100644 crypto_kem/lightsaber/avx2/SABER_params.h create mode 100644 crypto_kem/lightsaber/avx2/api.h create mode 100644 crypto_kem/lightsaber/avx2/cbd.c create mode 100644 crypto_kem/lightsaber/avx2/cbd.h create mode 100644 crypto_kem/lightsaber/avx2/kem.c create mode 100644 crypto_kem/lightsaber/avx2/kem.h create mode 100644 crypto_kem/lightsaber/avx2/pack_unpack.c create mode 100644 crypto_kem/lightsaber/avx2/pack_unpack.h create mode 100644 crypto_kem/lightsaber/avx2/poly.h create mode 100644 crypto_kem/lightsaber/avx2/polymul/consts.h create mode 100644 crypto_kem/lightsaber/avx2/polymul/matrix.c create mode 100644 crypto_kem/lightsaber/avx2/polymul/scm_avx.c create mode 100644 crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c create mode 100644 crypto_kem/lightsaber/avx2/verify.c create mode 100644 crypto_kem/lightsaber/avx2/verify.h create mode 100644 crypto_kem/saber/avx2/LICENSE create mode 100644 crypto_kem/saber/avx2/Makefile create mode 100644 crypto_kem/saber/avx2/SABER_indcpa.c create mode 100644 crypto_kem/saber/avx2/SABER_indcpa.h create mode 100644 crypto_kem/saber/avx2/SABER_params.h 
create mode 100644 crypto_kem/saber/avx2/api.h create mode 100644 crypto_kem/saber/avx2/cbd.c create mode 100644 crypto_kem/saber/avx2/cbd.h create mode 100644 crypto_kem/saber/avx2/kem.c create mode 100644 crypto_kem/saber/avx2/kem.h create mode 100644 crypto_kem/saber/avx2/pack_unpack.c create mode 100644 crypto_kem/saber/avx2/pack_unpack.h create mode 100644 crypto_kem/saber/avx2/poly.h create mode 100644 crypto_kem/saber/avx2/polymul/consts.h create mode 100644 crypto_kem/saber/avx2/polymul/matrix.c create mode 100644 crypto_kem/saber/avx2/polymul/scm_avx.c create mode 100644 crypto_kem/saber/avx2/polymul/toom-cook_4way.c create mode 100644 crypto_kem/saber/avx2/verify.c create mode 100644 crypto_kem/saber/avx2/verify.h create mode 100644 test/duplicate_consistency/firesaber_avx2.yml create mode 100644 test/duplicate_consistency/lightsaber_avx2.yml create mode 100644 test/duplicate_consistency/saber_avx2.yml diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index e58c7a7c..def16e46 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,4 +14,13 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + - name: avx2 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/firesaber/avx2/LICENSE b/crypto_kem/firesaber/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/firesaber/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile new file mode 100644 index 00000000..a44bbdb4 --- /dev/null +++ 
b/crypto_kem/firesaber/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libfiresaber_avx2.a +HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o + +CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c new file mode 100644 index 00000000..ab017224 --- /dev/null +++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c @@ -0,0 +1,416 @@ +#include "./polymul/toom-cook_4way.c" +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "randombytes.h" +#include +#include +#include +//#include "randombytes.h" +//#include "./polymul/toom_cook_4/toom-cook_4way.c" + +#define h1 4 //2^(EQ-EP-1) + +#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) + + +static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { + int32_t i, j; + + for (j = 0; j < SABER_KEYBYTES; j++) { + message_dec[j] = 0; + for (i = 0; i < 8; i++) { + message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); + } + } +} + +/*----------------------------------------------------------------------------------- + This routine generates a=[Matrix K x K] of 256-coefficient polynomials +-------------------------------------------------------------------------------------*/ + +static void GenMatrix(polyvec *a, const uint8_t *seed) { + uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; + + uint16_t temp_ar[SABER_N]; + + int i, j, k; + uint16_t mod = 
(SABER_Q - 1); + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + PQCLEAN_FIRESABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); + for (k = 0; k < SABER_N; k++) { + a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; + } + } + } +} + +static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { + + uint32_t i; + + uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + PQCLEAN_FIRESABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); + } +} + +//********************************matrix-vector mul routines***************************************************** +static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { + int64_t i, j; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + for (j = 0; j < NUM_POLY; j++) { + + if (isTranspose == 0) { + toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); + } else { + toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); + } + } + + TC_interpol(c_bucket, res_avx[i]); + } + +} + +static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { + + int64_t i; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); + } + TC_interpol(c_bucket, res_avx); +} + +//********************************matrix-vector mul routines***************************************************** + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { + + polyvec a[SABER_K]; + + uint16_t skpv1[SABER_K][SABER_N]; + + + + uint8_t 
seed[SABER_SEEDBYTES]; + uint8_t noiseseed[SABER_COINBYTES]; + int32_t i, j, k; + + +//--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + +//--------------AVX declaration ends------------------ + + randombytes(seed, SABER_SEEDBYTES); + + shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(noiseseed, SABER_COINBYTES); + + + GenMatrix(a, seed); //sample matrix A + + GenSecret(skpv1, noiseseed); + + +// Load sk into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + + } + + // Load a into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + + + + //------------------------do the matrix vector multiplication and rounding------------ + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + + // Now truncation + + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); + } + } + + //------------------Pack sk into byte string------- + + PQCLEAN_FIRESABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); + + //------------------Pack pk into byte string------- + + for (i = 0; i < SABER_K; 
i++) { // reuses skpv1[] for unpacking avx of public-key + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + PQCLEAN_FIRESABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string + + + for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. + pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + } + +} + + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + + + uint32_t i, j, k; + polyvec a[SABER_K]; // skpv; + uint8_t seed[SABER_SEEDBYTES]; + uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + + + uint16_t skpv1[SABER_K][SABER_N]; + uint16_t temp[SABER_K][SABER_N]; + uint16_t message[SABER_KEYBYTES * 8]; + + uint8_t msk_c[SABER_SCALEBYTES_KEM]; + + //--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod, mod_p; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i vprime_avx[SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + __m256i pkcl_avx[SABER_K][SABER_N / 16]; + + __m256i message_avx[SABER_N / 16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + mod_p = _mm256_set1_epi16(SABER_P - 1); + + + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + + //--------------AVX declaration ends------------------ + for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. 
+ seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + } + + GenMatrix(a, seed); + GenSecret(skpv1, noiseseed); + + // ----------- Load skpv1 into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + } + + // ----------- Load skpv1 into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + //-----------------matrix-vector multiplication and rounding + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + + // Now truncation + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); + + } + } + + + //-----this result should be put in b_prime for later use in server. 
+ for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + + PQCLEAN_FIRESABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string + +//**************client matrix-vector multiplication ends******************// + + //------now calculate the v' + + //-------unpack the public_key + PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); + } + } + + // InnerProduct + //for(k=0;k> i) & 0x01); + } + } + // message encoding + for (i = 0; i < SABER_N / 16; i++) { + message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); + message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); + } + + // SHIFTRIGHT(v'+h1-m mod p, EP-ET) + for (k = 0; k < SABER_N / 16; k++) { + vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); + vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); + vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); + } + + // Unpack avx + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); + } + + PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(msk_c, temp[0]); + + + for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { + ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + } + +} + + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t i, j; + uint16_t sksv[SABER_K][SABER_N]; //secret key of the server + uint16_t pksv[SABER_K][SABER_N]; + uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; + uint8_t 
scale_ar[SABER_SCALEBYTES_KEM]; + uint16_t op[SABER_N]; + + //--------------AVX declaration------------------ + + + //__m256i mod_p; + + __m256i v_avx[SABER_N / 16]; + + //__m256i acc[2*SABER_N/16]; + + __m256i sksv_avx[SABER_K][SABER_N / 16]; + __m256i pksv_avx[SABER_K][SABER_N / 16]; + + //mod_p=_mm256_set1_epi16(SABER_P-1); + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + //--------------AVX declaration ends------------------ + + //-------unpack the public_key + + PQCLEAN_FIRESABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key + PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); + pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); + } + } + + for (i = 0; i < SABER_N / 16; i++) { + v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); + } + + + // InnerProduct(b', s, mod p) + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sksv_avx[j], b_bucket[j]); + } + + vector_vector_mul(pksv_avx, b_bucket, v_avx); + + for (i = 0; i < SABER_N / 16; i++) { + _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); + } + + + for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { + scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; + } + + PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(op, scale_ar); + + + //addition of h2 + for (i = 0; i < SABER_N; i++) { + message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + } + + + POL2MSG(m, message_dec_unpacked); +} diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.h b/crypto_kem/firesaber/avx2/SABER_indcpa.h new file mode 100644 index 00000000..1b6c8311 --- /dev/null +++ b/crypto_kem/firesaber/avx2/SABER_indcpa.h @@ -0,0 +1,13 @@ +#ifndef INDCPA_H +#define INDCPA_H +#include "SABER_params.h" +#include + +void 
PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); + + +#endif diff --git a/crypto_kem/firesaber/avx2/SABER_params.h b/crypto_kem/firesaber/avx2/SABER_params.h new file mode 100644 index 00000000..e1476b6a --- /dev/null +++ b/crypto_kem/firesaber/avx2/SABER_params.h @@ -0,0 +1,45 @@ +#ifndef PARAMS_H +#define PARAMS_H +#include "api.h" + + + + +#define SABER_K 4 +#define SABER_MU 6 +#define SABER_ET 6 + +#define SABER_EQ 13 +#define SABER_EP 10 + +#define SABER_N 256 +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_COINBYTES 32 +#define SABER_KEYBYTES 32 + +#define SABER_HASHBYTES 32 + +#define SABER_POLYBYTES 416 //13*256/8 + +#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) + +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation + +#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) + +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ + +#endif diff --git a/crypto_kem/firesaber/avx2/api.h 
b/crypto_kem/firesaber/avx2/api.h new file mode 100644 index 00000000..cb5240dd --- /dev/null +++ b/crypto_kem/firesaber/avx2/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_FIRESABER_AVX2_API_H +#define PQCLEAN_FIRESABER_AVX2_API_H + + +#define PQCLEAN_FIRESABER_AVX2_CRYPTO_ALGNAME "FireSaber" +#define PQCLEAN_FIRESABER_AVX2_CRYPTO_BYTES 32 +#define PQCLEAN_FIRESABER_AVX2_CRYPTO_CIPHERTEXTBYTES 1472 +#define PQCLEAN_FIRESABER_AVX2_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_FIRESABER_AVX2_CRYPTO_SECRETKEYBYTES 3040 + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_FIRESABER_AVX2_API_H */ diff --git a/crypto_kem/firesaber/avx2/cbd.c b/crypto_kem/firesaber/avx2/cbd.c new file mode 100644 index 00000000..37970a81 --- /dev/null +++ b/crypto_kem/firesaber/avx2/cbd.c @@ -0,0 +1,52 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ + + +static uint64_t load_littleendian(const unsigned char *x, int bytes) { + int i; + uint64_t r = x[0]; + for (i = 1; i < bytes; i++) { + r |= (uint64_t)x[i] << (8 * i); + } + return r; +} + + +void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { + uint16_t Qmod_minus1 = SABER_Q - 1; + + uint32_t t, d, a[4], b[4]; + int i, j; + + for (i = 0; i < SABER_N / 4; i++) { + t = load_littleendian(buf + 3 * i, 3); + d = 0; + for (j = 0; j < 3; j++) { + d += (t >> j) & 0x249249; + } + + a[0] = d & 0x7; + b[0] = (d >> 3) & 0x7; + a[1] = (d >> 6) & 0x7; + b[1] = (d >> 9) & 0x7; + a[2] = (d >> 12) & 0x7; + b[2] = (d >> 15) & 0x7; + a[3] = (d >> 18) & 0x7; + b[3] = (d >> 21); + + r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; + r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; + r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; + r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + + } +} diff --git a/crypto_kem/firesaber/avx2/cbd.h b/crypto_kem/firesaber/avx2/cbd.h new file mode 100644 index 00000000..210bcc50 --- /dev/null +++ b/crypto_kem/firesaber/avx2/cbd.h @@ -0,0 +1,16 @@ +#ifndef CBD_H +#define CBD_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ +#include "poly.h" +#include + +void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); + + +#endif diff --git a/crypto_kem/firesaber/avx2/kem.c b/crypto_kem/firesaber/avx2/kem.c new file mode 100644 index 00000000..2e72e6aa --- /dev/null +++ b/crypto_kem/firesaber/avx2/kem.c @@ -0,0 +1,79 @@ +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "fips202.h" +#include "randombytes.h" +#include "verify.h" +#include +#include +#include +#include + + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + int i; + + PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk + for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + } + + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. + + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in PQCLEAN_FIRESABER_AVX2_crypto_kem_dec() fails. 
+ return (0); +} + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; + + randombytes(buf, 32); + + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output + + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); + // K^ <-- kr[0:31] + // noiseseed (r) <-- kr[32:63]; + PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { + int i; + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + + PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message + + // Multitarget countermeasure for coins + contributory KEM + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk + buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; + } + + sha3_512(kr, buf, 64); + + PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + + fail = PQCLEAN_FIRESABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) + + PQCLEAN_FIRESABER_AVX2_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h new file mode 100644 index 
00000000..a55514d9 --- /dev/null +++ b/crypto_kem/firesaber/avx2/kem.h @@ -0,0 +1,35 @@ +#ifndef INDCPA_H +#define INDCPA_H + +#include + +void PQCLEAN_FIRESABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); + + +void PQCLEAN_FIRESABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_FIRESABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); + +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); + + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); + + + +//uint64_t clock1,clock2; + +//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; + + +#endif diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c new file mode 100644 index 00000000..33c481b3 --- /dev/null +++ b/crypto_kem/firesaber/avx2/pack_unpack.c @@ -0,0 +1,502 @@ +#include "pack_unpack.h" + + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); + bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 
) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); + } +} + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; + data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; + data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); + data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; + data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; + data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); + data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); + data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); + } + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); + } +} + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + data[offset_data] = bytes[j] & 0x0f; + data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; + } +} + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + 
bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + } +} + + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; + data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; + data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; + data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); + } + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + 
+ bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); + + bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ 
offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } +} + + + +void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); 
+ data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } +} + +void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + + + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + 
offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); + data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } + + +} + + +void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); 
+ + bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + //for(i=0;i> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 
& (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + //} + + +} + + +void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + /*This function packs 11 bit data stream into 8 bits of data. + */ + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); + + 
bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); + + } + } + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + + data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); + + data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); + + data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); + + data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); + + data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); + } + } + + +} + +void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; 
j++) { + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); + } + } + + +} + + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); + } + } + + +} + +void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { + + if (modulus == 1024) { + PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data); + } else if (modulus == 8192) { + PQCLEAN_FIRESABER_AVX2_POLVECq2BS(bytes, data); + } +} + +void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], 
const uint8_t *bytes, uint16_t modulus) { + + if (modulus == 1024) { + PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes); + } else if (modulus == 8192) { + PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes); + } + +} diff --git a/crypto_kem/firesaber/avx2/pack_unpack.h b/crypto_kem/firesaber/avx2/pack_unpack.h new file mode 100644 index 00000000..ba8a568f --- /dev/null +++ b/crypto_kem/firesaber/avx2/pack_unpack.h @@ -0,0 +1,56 @@ +#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +#include "SABER_params.h" +#include +#include + +void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); + +void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); + +void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t 
*bytes); + + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +#endif diff --git a/crypto_kem/firesaber/avx2/poly.h b/crypto_kem/firesaber/avx2/poly.h new file mode 100644 index 00000000..8443de34 --- /dev/null +++ b/crypto_kem/firesaber/avx2/poly.h @@ -0,0 +1,27 @@ +#ifndef POLY_H +#define POLY_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------------------------*/ +#include "SABER_params.h" +#include <stdint.h> + +typedef struct { /* one polynomial: SABER_N 16-bit coefficients */ + uint16_t coeffs[SABER_N]; +} poly; + +typedef struct { /* vector of SABER_K polynomials */ + poly vec[SABER_K]; +} polyvec; + +void PQCLEAN_FIRESABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); + + +void PQCLEAN_FIRESABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); + + +#endif diff --git a/crypto_kem/firesaber/avx2/polymul/consts.h b/crypto_kem/firesaber/avx2/polymul/consts.h new file mode 100644 index 00000000..40826398 --- /dev/null +++ b/crypto_kem/firesaber/avx2/polymul/consts.h @@ -0,0 +1,20 @@ +#include "../SABER_params.h" + +#define AVX_N (SABER_N >> 4) +#define small_len_avx (AVX_N >> 2) + +#define SCHB_N 16 + +#define N_SB (SABER_N >> 2) +#define N_SB_RES (2*N_SB-1) + +#define N_SB_16 (N_SB >> 2) +#define N_SB_16_RES (2*N_SB_16-1) + +#define AVX_N1 16 /*N/16*/ + +#define SCM_SIZE 16 + +// The dimension of a vector.
i.e. a vector has NUM_POLY elements and a matrix has NUM_POLY x NUM_POLY elements +#define NUM_POLY SABER_K +//int NUM_POLY=2; diff --git a/crypto_kem/firesaber/avx2/polymul/matrix.c b/crypto_kem/firesaber/avx2/polymul/matrix.c new file mode 100644 index 00000000..5fa35783 --- /dev/null +++ b/crypto_kem/firesaber/avx2/polymul/matrix.c @@ -0,0 +1,303 @@ +#include <immintrin.h> +/* NOTE(review): appears to transpose, in place, a 16x16 matrix of 16-bit values held in M[0..15] (unpack 16/32/64 then 128-bit lane permute) - confirm. */ +static void transpose_n1(__m256i *M) +{ + //int i; + register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + register __m256i temp, temp0, temp1, temp2; + + //for(i=0; i<8; i=i+1) + //{ + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + + + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = _mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + + + r0 =
_mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + + //for(i=0; i<4; i=i+1) + //{ + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = _mm256_unpacklo_epi32(r6, r7); + + //} + + + //for(i=0; i<2; i=i+1) + //{ + r8 = _mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + //} +// for(i=0; i<2; i=i+1) +// { + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + +// } + + //------------------------------------------------------- + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +/* +void transpose_unrolled(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + + //for(i=0; i<8; i=i+1) + //{ + tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); + tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); + + tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); + 
tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); + + tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); + tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); + + tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); + tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); + + tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); + tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); + + tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); + tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); + + tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); + tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); + + tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); + tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); + + //} + + //------------------------------------------------------- + //for(i=0; i<4; i=i+1) + //{ + bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); + bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); + + bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); + bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); + + bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); + bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); + + bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); + bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); + + //} + + //for(i=0; i<2; i=i+1) + //{ + dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); + dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); + + dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); + dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); + + M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); + M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); + M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); + M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); + + //} + //for(i=0; i<2; i=i+1) + //{ + eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); + eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); + + eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); + eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); + + //} + + //------------------------------------------------------- + + //------------------------------------------------------- + for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = 
_mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + + for(i=0; i<2; i=i+1) + { + fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); + fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); + gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); + } + + //------------------------------------------------------- + + + + M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); + M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); + M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); + M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); + + M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); + M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); + M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); + M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); + + M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); + M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); + M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); + M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); +} + + +void transpose1(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + for(i=0; i<8; i=i+1) + { + tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); + tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); + } + + for(i=0; i<4; i=i+1) + { + bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); + bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); + } + for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); + dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); + eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + fL[i] = 
_mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]);
+        fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]);
+    }
+    for(i=0; i<2; i=i+1)
+    {
+        gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]);
+        gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]);
+    }
+
+    M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+    M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+    M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+    M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+    M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+    M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+    M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+    M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+    M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+    M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+    M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+    M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+    M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+    M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+    M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+    M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+*/
diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
new file mode 100644
index 00000000..4e4f11f8
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
@@ -0,0 +1,60 @@
+/*
+ * Schoolbook multiplication of two SCM_SIZE-term polynomials whose
+ * coefficients are __m256i vectors of 16-bit lanes: every lane carries an
+ * independent product.
+ *
+ * SCM_SIZE is not defined here; this file is #included by toom-cook_4way.c
+ * right after consts.h.
+ * NOTE(review): SCM_SIZE is assumed to be 16, as the replaced hand-unrolled
+ * code was written for exactly 16 terms - confirm against consts.h.
+ * NOTE(review): the loops below replace that fully unrolled code; re-run the
+ * benchmarks if this kernel is performance critical.
+ */
+
+#include <immintrin.h>
+
+/*
+ * Lane-wise a*b + c on 16-bit lanes; only the low 16 bits of a product are kept.
+ * Declared static: a plain C99 `inline` definition emits no external
+ * definition, so any call the compiler chose not to inline would not link.
+ */
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
+}
+
+/*
+ * Shared kernel. For every k in [0, 2*SCM_SIZE-2] it sums a[i]*b[k-i] over
+ * all i for which both indices are in range, then stores the sum in c_avx[k]
+ * (accumulate == 0) or adds it to the value already there (accumulate != 0).
+ * c_avx[2*SCM_SIZE-1] is always cleared. Additions wrap modulo 2^16 per lane,
+ * so the order in which the partial products are added does not matter.
+ */
+static inline void schoolbook_avx_core(const __m256i *a, const __m256i *b, __m256i *c_avx, int accumulate) {
+    int i, k;
+
+    for (k = 0; k < 2 * SCM_SIZE - 1; k++) {
+        /* range of i for which k-i is also a valid index */
+        const int lo = (k < SCM_SIZE) ? 0 : k - (SCM_SIZE - 1);
+        const int hi = (k < SCM_SIZE) ? k : SCM_SIZE - 1;
+        __m256i sum = _mm256_mullo_epi16(a[lo], b[k - lo]);
+
+        for (i = lo + 1; i <= hi; i++) {
+            sum = mul_add(a[i], b[k - i], sum);
+        }
+        c_avx[k] = accumulate ? _mm256_add_epi16(sum, c_avx[k]) : sum;
+    }
+
+    c_avx[2 * SCM_SIZE - 1] = _mm256_set_epi64x(0, 0, 0, 0);
+}
+
+/* c_avx[k] += (a*b)[k] for k in [0, 2*SCM_SIZE-2]; c_avx[2*SCM_SIZE-1] = 0. */
+static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx)
+{
+    schoolbook_avx_core(a, b, c_avx, 1);
+}
+
+/* c_avx[k] = (a*b)[k] for k in [0, 2*SCM_SIZE-2]; c_avx[2*SCM_SIZE-1] = 0. */
+static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx)
+{
+    schoolbook_avx_core(a, b, c_avx, 0);
+}
diff --git a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
new file mode 100644
index 00000000..78fb86c2
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
@@ -0,0 +1,1010 @@
+/*
+Cleaned version for step by step approach look into the _debug file
+*/
+//#include "timing.c"
+#include "consts.h"
+#include "matrix.c"
+#include "scm_avx.c"
+
+static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX.
+{
+    /*
+     * Karatsuba evaluations of the 7 polynomials read from a, 9 vectors each,
+     * stored back to back in slots 0..62. (Original note: SCM_SIZE = 16.)
+     */
+    __m256i a_bucket[SCM_SIZE*4];
+    int p, g;
+
+    //------------------AVX evaluation of the 7 polys in a-------------------
+    for (p = 0; p < 7; p++) {
+        const __m256i *src = a + p*small_len_avx;
+        __m256i *dst = a_bucket + 9*p;
+        const __m256i r0_avx = src[0];
+        const __m256i r1_avx = src[1];
+        const __m256i r2_avx = src[2];
+        const __m256i r3_avx = src[3];
+
+        dst[0] = r0_avx;
+        dst[1] = r1_avx;
+        dst[2] = r2_avx;
+        dst[3] = r3_avx;
+        dst[4] = _mm256_add_epi16(r0_avx, r1_avx);
+        dst[5] = _mm256_add_epi16(r2_avx, r3_avx);
+        dst[6] = _mm256_add_epi16(r0_avx, r2_avx);
+        dst[7] = _mm256_add_epi16(r1_avx, r3_avx);
+        dst[8] = _mm256_add_epi16(dst[6], dst[7]);
+    }
+
+    /*
+     * The loop above never writes the slots past 7*9-1, yet a_bucket+48 is
+     * handed to transpose_n1 below. Clear them so no indeterminate stack data
+     * is read. NOTE(review): assumes transpose_n1 reads 16 vectors, like the
+     * transposes in matrix.c - confirm.
+     */
+    for (p = 7*9; p < SCM_SIZE*4; p++) {
+        a_bucket[p] = _mm256_set_epi64x(0, 0, 0, 0);
+    }
+
+    //-----------------Forward transposes--------------------------------------
+    for (g = 0; g < 4; g++) {
+        transpose_n1(a_bucket + 16*g);
+    }
+
+    //----------------------all multiplications---------------------------------
+    // f == 0: schoolbook_avx_new2 overwrites c_bucket;
+    // otherwise schoolbook_avx_new3_acc adds the products to c_bucket.
+    // c_bucket is not transposed back in this function.
+    for (g = 0; g < 4; g++) {
+        if(f==0){
+            schoolbook_avx_new2(a_bucket + 16*g, b_bucket + 16*g, c_bucket + 2*g*SCM_SIZE);
+        }
+        else{
+            schoolbook_avx_new3_acc(a_bucket + 16*g, b_bucket + 16*g, c_bucket + 2*g*SCM_SIZE);
+        }
+    }
+}
+
+/*
+ * Karatsuba evaluation of the 7 polynomials read from b (4 vectors each,
+ * small_len_avx vectors apart) into b_bucket, 9 vectors per polynomial,
+ * followed by the same four transpose_n1 calls that
+ * batch_64coefficient_multiplications_new applies to its a_bucket.
+ * NOTE(review): slot 63 of b_bucket is not written here before
+ * transpose_n1(b_bucket+48); presumably the caller's buffer covers and
+ * initialises it - confirm.
+ */
+static void KARA_eval(__m256i* b, __m256i *b_bucket){
+    int p, g;
+
+    for (p = 0; p < 7; p++) {
+        const __m256i *src = b + p*small_len_avx;
+        __m256i *dst = b_bucket + 9*p;
+        const __m256i r0_avx = src[0];
+        const __m256i r1_avx = src[1];
+        const __m256i r2_avx = src[2];
+        const __m256i r3_avx = src[3];
+
+        dst[0] = r0_avx;
+        dst[1] = r1_avx;
+        dst[2] = r2_avx;
+        dst[3] = r3_avx;
+        dst[4] = _mm256_add_epi16(r0_avx, r1_avx);
+        dst[5] = _mm256_add_epi16(r2_avx, r3_avx);
+        dst[6] = _mm256_add_epi16(r0_avx, r2_avx);
+        dst[7] = _mm256_add_epi16(r1_avx, r3_avx);
+        dst[8] = _mm256_add_epi16(dst[6], dst[7]);
+    }
+
+    //--------------Evaluating B poly ends-------------------------------
+
+    for (g = 0; g < 4; g++) {
+        transpose_n1(b_bucket + 16*g);
+    }
+}
+
+static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
+
+    //int64_t i;
+    register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
+
+    __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
+
+    //CLOCK1=cpucycles();
+
+    //------------------------AVX interpolation for 1st poly external-------------------
+
+    //loop1
+    res_avx0 = c_bucket[0];
+    res_avx2 = c_bucket[1];
+    res_avx4 = c_bucket[2];
+    res_avx6 = c_bucket[3];
+
+    c6_avx=c_bucket[6];
+    c7_avx=c_bucket[7];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
+
+    res_avx1 = c_bucket[16];
+    res_avx3 = c_bucket[17];
+    res_avx5 = c_bucket[18];
+    res_avx7 = c_bucket[19];
+
+    c22_avx=c_bucket[22];
+
c23_avx=c_bucket[23]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final0[0]=res_avx0; + result_final0[1]=res_avx1; + + result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final0[6]=res_avx6; + result_final0[7]=res_avx7; + + + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + + //loop1 + res_avx0 = c_bucket[9]; //c_bucket0 + res_avx2 = c_bucket[10]; //c_bucket1 + res_avx4 = c_bucket[11]; //c_bucket2 + res_avx6 = c_bucket[12]; //c_bucket3 + + c6_avx=c_bucket[15]; //c_bucket6 + c7_avx=c_bucket[32]; //c_bucket7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); + + res_avx1 = c_bucket[25]; //c_bucket0 + res_avx3 = c_bucket[26]; //c_bucket1 + res_avx5 = c_bucket[27]; //c_bucket2 
+ res_avx7 = c_bucket[28]; //c_bucket3 + + c22_avx=c_bucket[31]; + c23_avx=c_bucket[48]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final1[0]=res_avx0; + result_final1[1]=res_avx1; + + result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final1[6]=res_avx6; + result_final1[7]=res_avx7; + + + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + + //loop1 + res_avx0 = c_bucket[34]; //c_bucket0 + res_avx2 = c_bucket[35]; //c_bucket1 + res_avx4 = c_bucket[36]; + res_avx6 = c_bucket[37]; + + c6_avx=c_bucket[40]; + c7_avx=c_bucket[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); + + res_avx1 = c_bucket[50]; //c_bucket0 + res_avx3 = c_bucket[51]; //c_bucket1 + res_avx5 = 
c_bucket[52]; + res_avx7 = c_bucket[53]; + + c22_avx=c_bucket[56]; + c23_avx=c_bucket[57]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + //loop4 + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + //loop5 + result_final2[0]=res_avx0; + result_final2[1]=res_avx1; + + result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final2[6]=res_avx6; + result_final2[7]=res_avx7; + + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + + //loop1 + res_avx0 = c_bucket[43]; + res_avx2 = c_bucket[44]; + res_avx4 = c_bucket[45]; + res_avx6 = c_bucket[46]; + + c6_avx=c_bucket[65]; + c7_avx=c_bucket[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); + + res_avx1 = c_bucket[59]; + res_avx3 = c_bucket[60]; + res_avx5 = c_bucket[61]; + res_avx7 = c_bucket[62]; + + 
c22_avx=c_bucket[81]; + c23_avx=c_bucket[82]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final3[0]=res_avx0; + result_final3[1]=res_avx1; + + result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final3[6]=res_avx6; + result_final3[7]=res_avx7; + + + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + + //loop1 + res_avx0 = c_bucket[68]; + res_avx2 = c_bucket[69]; + res_avx4 = c_bucket[70]; + res_avx6 = c_bucket[71]; + + c6_avx=c_bucket[74]; + c7_avx=c_bucket[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); + + res_avx1 = c_bucket[84]; + res_avx3 = c_bucket[85]; + res_avx5 = c_bucket[86]; + res_avx7 = c_bucket[87]; + + c22_avx=c_bucket[90]; + c23_avx=c_bucket[91]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final4[0]=res_avx0; + result_final4[1]=res_avx1; + + result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final4[6]=res_avx6; + result_final4[7]=res_avx7; + + + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + + //loop1 + res_avx0 = c_bucket[77]; + res_avx2 = c_bucket[78]; + res_avx4 = c_bucket[79]; + res_avx6 = c_bucket[96]; + + c6_avx=c_bucket[99]; + c7_avx=c_bucket[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx); + + res_avx1 = c_bucket[93]; + res_avx3 = c_bucket[94]; + res_avx5 = c_bucket[95]; + res_avx7 = c_bucket[112]; + + c22_avx=c_bucket[115]; + c23_avx=c_bucket[116]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final5[0]=res_avx0; + result_final5[1]=res_avx1; + + result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final5[6]=res_avx6; + result_final5[7]=res_avx7; + + + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + + //loop1 + res_avx0 = c_bucket[102]; + res_avx2 = c_bucket[103]; + res_avx4 = c_bucket[104]; + res_avx6 = c_bucket[105]; + + c6_avx=c_bucket[108]; + c7_avx=c_bucket[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); + + res_avx1 = c_bucket[118]; + res_avx3 = c_bucket[119]; + res_avx5 = c_bucket[120]; + res_avx7 = c_bucket[121]; + + c22_avx=c_bucket[124]; + c23_avx=c_bucket[125]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final6[0]=res_avx0; + result_final6[1]=res_avx1; + + result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final6[6]=res_avx6; + result_final6[7]=res_avx7; + + + //------------------------AVX interpolation for 7th poly ends-------------- + + //CLOCK2=cpucycles(); + //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); + //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); + + + +} + +static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ + + int i; + +//---------------AVX data----------------------------- + + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7*small_len_avx]; + +//----------------AVX data---------------------------- + + +// EVALUATION + + //CLOCK1=cpucycles(); + + for (i=0; i>= 63; + return 
(uint8_t) r; +} + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_FIRESABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/crypto_kem/firesaber/avx2/verify.h b/crypto_kem/firesaber/avx2/verify.h new file mode 100644 index 00000000..2ec50370 --- /dev/null +++ b/crypto_kem/firesaber/avx2/verify.h @@ -0,0 +1,22 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------*/ + +#include +#include + +/* returns 0 for equal strings, 1 for non-equal strings */ +uint8_t PQCLEAN_FIRESABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_FIRESABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + + +#endif diff --git a/crypto_kem/firesaber/clean/LICENSE b/crypto_kem/firesaber/clean/LICENSE index 08c799e3..d5d21fff 100644 --- a/crypto_kem/firesaber/clean/LICENSE +++ b/crypto_kem/firesaber/clean/LICENSE @@ -1,8 +1 @@ ----------------------------------------------------------------------------------------- -SABER_v1.1 - -Public domain - -Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, -Frederik Vercauteren ----------------------------------------------------------------------------------------- +Public Domain diff --git a/crypto_kem/firesaber/clean/Makefile b/crypto_kem/firesaber/clean/Makefile index e00112e8..8f8dd8f7 100644 --- a/crypto_kem/firesaber/clean/Makefile +++ b/crypto_kem/firesaber/clean/Makefile @@ -1,10 +1,10 @@ # This Makefile can be used with GNU Make or 
BSD Make LIB=libfiresaber_clean.a -HEADERS=api.h cbd.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h +HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c index da8aa685..8f4364e7 100644 --- a/crypto_kem/firesaber/clean/SABER_indcpa.c +++ b/crypto_kem/firesaber/clean/SABER_indcpa.c @@ -3,296 +3,90 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include "poly_mul.h" #include "randombytes.h" #include #include +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + int i, j; -#define h1 4 //2^(EQ-EP-1) + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(seed_s, SABER_NOISE_SEEDBYTES); -#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) + 
PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_FIRESABER_CLEAN_GenSecret(s, seed_s); + PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]); -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose); - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec); - -static void GenMatrix(polyvec *a, const unsigned char *seed) { - unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_FIRESABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); } } + + PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); + PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); } +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + int i, j; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk) { 
- polyvec a[SABER_K]; + PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_FIRESABER_CLEAN_GenSecret(sp, seed_sp); + PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); - uint16_t skpv[SABER_K][SABER_N]; - - unsigned char seed[SABER_SEEDBYTES]; - unsigned char noiseseed[SABER_COINBYTES]; - int32_t i, j; - uint16_t mod_q = SABER_Q - 1; - - - uint16_t res[SABER_K][SABER_N]; - - randombytes(seed, SABER_SEEDBYTES); - - // for not revealing system RNG state - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); - randombytes(noiseseed, SABER_COINBYTES); - - GenMatrix(a, seed); //sample matrix A - - // generate secret from constant-time binomial distribution - PQCLEAN_FIRESABER_CLEAN_GenSecret(skpv, noiseseed); - - // do the matrix vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { + for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1); - - // now rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - // shift right 3 bits - res[i][j] = (res[i][j] + h1) & (mod_q); - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP)); + bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); } } - // unload and pack sk=3 x (256 coefficients of 14 bits) - PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q); + PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); + PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, pk); + PQCLEAN_FIRESABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits) - // load the public-key coefficients - PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(pk, res, SABER_P); + PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(mp, m); - - // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + for (j = 0; j < SABER_N; j++) { + vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); } + PQCLEAN_FIRESABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); } +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) { - uint32_t i, j, k; - polyvec a[SABER_K]; - unsigned char seed[SABER_SEEDBYTES]; - // public key of received by the client - uint16_t pkcl[SABER_K][SABER_N]; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - uint16_t res[SABER_K][SABER_N]; - uint16_t mod_p = SABER_P - 1; - uint16_t mod_q = SABER_Q - 1; - uint16_t vprime[SABER_N]; - unsigned char msk_c[SABER_SCALEBYTES_KEM]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N]; + uint16_t v[SABER_N] = {0}; + uint16_t cm[SABER_N]; + int i; - // extract the seedbytes from Public Key. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; - } + PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext); + PQCLEAN_FIRESABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); + PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); - GenMatrix(a, seed); - - // generate secret from constant-time binomial distribution - PQCLEAN_FIRESABER_CLEAN_GenSecret(skpv1, noiseseed); - - // matrix-vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0); - - // now rounding - //shift right 3 bits - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = ( res[i][j] + h1 ) & mod_q; - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) ); - } - } - - PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P); - - // ************client matrix-vector multiplication ends************ - - // now calculate the v' - // unpack the public_key - // pkcl is the b in the protocol - PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P); for (i = 0; i < SABER_N; i++) { - vprime[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - skpv1[i][j] = skpv1[i][j] & (mod_p); - } + v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); } - // vector-vector scalar multiplication with mod p - InnerProd(pkcl, skpv1, mod_p, vprime); - - // addition of h1 to vprime - for (i = 0; i < SABER_N; i++) { - vprime[i] = vprime[i] + h1; - } - - // unpack message_received; - for (j = 0; j < SABER_KEYBYTES; j++) { - for (i = 0; i < 8; i++) { - message[8 * j + i] = ((message_received[j] >> i) & 0x01); - } - } - - // message encoding - for (i = 0; i < SABER_N; i++) { - message[i] = (message[i] << (SABER_EP - 1)); - } - - for (k = 0; k < SABER_N; k++) { - vprime[k] = ( (vprime[k] - message[k]) 
& (mod_p) ) >> (SABER_EP - SABER_ET); - } - - - PQCLEAN_FIRESABER_CLEAN_pack_6bit(msk_c, vprime); - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j]; - } -} - - -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) { - uint32_t i, j; - // secret key of the server - uint16_t sksv[SABER_K][SABER_N]; - uint16_t pksv[SABER_K][SABER_N]; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t mod_p = SABER_P - 1; - uint16_t v[SABER_N]; - uint16_t op[SABER_N]; - - // sksv is the secret-key - PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q); - // pksv is the ciphertext - PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P); - - // vector-vector scalar multiplication with mod p - for (i = 0; i < SABER_N; i++) { - v[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - sksv[i][j] = sksv[i][j] & (mod_p); - } - } - InnerProd(pksv, sksv, mod_p, v); - - //Extraction - for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i]; - } - - PQCLEAN_FIRESABER_CLEAN_un_pack6bit(scale_ar, op); - - //addition of h1 - for (i = 0; i < SABER_N; i++) { - v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1); - } - - // pack decrypted message - POL2MSG(v, message_dec); -} -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) { - uint16_t acc[SABER_N]; - int32_t i, j, k; - - if (transpose == 1) { - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_FIRESABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - //reduction mod p - res[i][k] = (res[i][k] & mod); - //clear the accumulator - acc[k] = 0; - } - } - } - } else { - for (i = 0; i < SABER_K; 
i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_FIRESABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N); - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - // reduction - res[i][k] = res[i][k] & mod; - // clear the accumulator - acc[k] = 0; - } - } - } - } -} - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) { - int32_t i, j; - - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i); - } - } -} - - -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) { - uint32_t j, k; - uint16_t acc[SABER_N]; - - // vector-vector scalar multiplication with mod p - for (j = 0; j < SABER_K; j++) { - PQCLEAN_FIRESABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[k] = res[k] + acc[k]; - // reduction - res[k] = res[k] & mod; - // clear the accumulator - acc[k] = 0; - } - } + PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(m, v); } diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.h b/crypto_kem/firesaber/clean/SABER_indcpa.h index 6007352d..28a5feee 100644 --- a/crypto_kem/firesaber/clean/SABER_indcpa.h +++ b/crypto_kem/firesaber/clean/SABER_indcpa.h @@ -1,9 +1,13 @@ #ifndef INDCPA_H #define INDCPA_H +#include "SABER_params.h" +#include + +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); -void 
PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk); -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext); -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec); #endif - diff --git a/crypto_kem/firesaber/clean/SABER_params.h b/crypto_kem/firesaber/clean/SABER_params.h index b0d517f8..9121a12b 100644 --- a/crypto_kem/firesaber/clean/SABER_params.h +++ b/crypto_kem/firesaber/clean/SABER_params.h @@ -1,49 +1,39 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" -#define SABER_K 4 +/* Change this for different security strengths */ + +/* Don't change anything below this line */ +#define SABER_L 4 #define SABER_MU 6 #define SABER_ET 6 #define SABER_EQ 13 #define SABER_EP 10 - #define SABER_N 256 -#define SABER_Q 8192 -#define SABER_P 1024 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_HASHBYTES 32 +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation - -#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) - -#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8) - -#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) #define SABER_INDCPA_PUBLICKEYBYTES 
(SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ - - - +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif - diff --git a/crypto_kem/firesaber/clean/api.h b/crypto_kem/firesaber/clean/api.h index 56d17038..14718674 100644 --- a/crypto_kem/firesaber/clean/api.h +++ b/crypto_kem/firesaber/clean/api.h @@ -1,14 +1,18 @@ #ifndef PQCLEAN_FIRESABER_CLEAN_API_H #define PQCLEAN_FIRESABER_CLEAN_API_H + #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_ALGNAME "FireSaber" -#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_SECRETKEYBYTES 3040 -#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_PUBLICKEYBYTES (4*320+32) #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_BYTES 32 #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 1472 +#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_PUBLICKEYBYTES 1312 +#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_SECRETKEYBYTES 3040 int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); -int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); -int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + +int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + #endif /* api_h */ diff --git a/crypto_kem/firesaber/clean/cbd.c b/crypto_kem/firesaber/clean/cbd.c index cca885a1..8032eb5c 
100644 --- a/crypto_kem/firesaber/clean/cbd.c +++ b/crypto_kem/firesaber/clean/cbd.c @@ -1,3 +1,7 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include /*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "SABER_params.h" -#include "api.h" -#include "cbd.h" -#include -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,33 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint32_t t, d, a[4], b[4]; int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = (uint32_t) load_littleendian(buf + 3 * i, 3); + t = load_littleendian(buf + 3 * i, 3); d = 0; for (j = 0; j < 3; j++) { d += (t >> j) & 0x249249; } - a[0] = d & 0x7; - b[0] = (d >> 3) & 0x7; - a[1] = (d >> 6) & 0x7; - b[1] = (d >> 9) & 0x7; + a[0] = d & 0x7; + b[0] = (d >> 3) & 0x7; + a[1] = (d >> 6) & 0x7; + b[1] = (d >> 9) & 0x7; a[2] = (d >> 12) & 0x7; b[2] = (d >> 15) & 0x7; a[3] = (d >> 18) & 0x7; b[3] = (d >> 21); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; - + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]); + s[4 * i + 2] = 
(uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/firesaber/clean/cbd.h b/crypto_kem/firesaber/clean/cbd.h index b10e5202..0fa18b02 100644 --- a/crypto_kem/firesaber/clean/cbd.h +++ b/crypto_kem/firesaber/clean/cbd.h @@ -1,6 +1,5 @@ #ifndef CBD_H #define CBD_H - /*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ - -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); + #endif diff --git a/crypto_kem/firesaber/clean/kem.c b/crypto_kem/firesaber/clean/kem.c index c66cfed9..e94219a6 100644 --- a/crypto_kem/firesaber/clean/kem.c +++ b/crypto_kem/firesaber/clean/kem.c @@ -1,5 +1,6 @@ #include "SABER_indcpa.h" #include "SABER_params.h" +#include "api.h" #include "fips202.h" #include "randombytes.h" #include "verify.h" @@ -7,90 +8,71 @@ #include #include -int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + +int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { int i; - // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk - PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk); - - // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { - sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; + sk[i + 
SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk } - // Then hash(pk) is appended. - sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. - // Remaining part of sk contains a pseudo-random number. - // This is output when check in crypto_kem_dec() fails. - randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES ); + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec() fails. return (0); } -int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) { - // Will contain key, coins - unsigned char kr[64]; - unsigned char buf[64]; +int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; randombytes(buf, 32); - // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output - sha3_256(buf, buf, 32); + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output - // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM - sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); - - // kr[0:63] <-- Hash(buf[0:63]); - sha3_512(kr, buf, 64); + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - // buf[0:31] contains message; kr[32:63] contains randomness r; - 
PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, ct); + PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; - sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); - // hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } - -int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { +int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { int i; - unsigned char fail; - unsigned char cmp[SABER_BYTES_CCA_DEC]; - unsigned char buf[64]; - - // Will contain key, coins - unsigned char kr[64]; - const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES; - - // buf[0:31] <-- message - PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(sk, ct, buf); + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message // Multitarget countermeasure for coins + contributory KEM - // Save hash by storing h(pk) in sk - for (i = 0; i < 32; i++) { + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; } sha3_512(kr, buf, 64); - PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp); + PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk); + fail = PQCLEAN_FIRESABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC); - fail = PQCLEAN_FIRESABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC); - - // overwrite coins in kr with h(c) - sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) PQCLEAN_FIRESABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); - // 
hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c index 9e68ffc1..0add1409 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.c +++ b/crypto_kem/firesaber/clean/pack_unpack.c @@ -1,254 +1,136 @@ +#include "api.h" #include "pack_unpack.h" +#include -void PQCLEAN_FIRESABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | - ((data[offset_data + 1] & 0x7) << 3) | - ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | - ((data[offset_data + 3] & 0x7) << 1) | - ((data[offset_data + 4] & 0x7) << 4) | - (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | - ((data[offset_data + 6] & 0x7) << 2) | - ((data[offset_data + 7] & 0x7) << 5); - } -} - -void PQCLEAN_FIRESABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) | - (((bytes[offset_byte + 1]) & 0x01) << 2); - data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) | - (((bytes[offset_byte + 2]) & 0x03) << 1); - data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07); - data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07); - } -} - 
-void PQCLEAN_FIRESABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | - ((data[offset_data + 1] & 0x0f) << 4); - } -} - -void PQCLEAN_FIRESABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) { - uint32_t j; - uint32_t offset_data; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - ar[offset_data] = bytes[j] & 0x0f; - ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f; - } -} - -void PQCLEAN_FIRESABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - +void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 4; j++) { offset_byte = 3 * j; offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | - ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | - ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | - ((data[offset_data + 3] & 0x3f) << 2); + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); } } - -void PQCLEAN_FIRESABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - +void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 4; j++) { offset_byte = 3 * j; offset_data = 4 * j; data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = 
((bytes[offset_byte + 0] >> 6) & 0x03) | - ((bytes[offset_byte + 1] & 0x0f) << 2); - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | - ((bytes[offset_byte + 2] & 0x03) << 4); + data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2); + data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4); data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); } } - -static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) | - ((data[i][offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) | - ((data[i][offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) | - ((data[i][offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff); - } - } -} - -static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x03) << 8); - data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | - ((bytes[offset_byte + 2] & 0x0f) << 6); - data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | - ((bytes[offset_byte + 3] & 0x3f) << 4); - data[i][offset_data + 3] = ((bytes[offset_byte + 
3] >> 6) & (0x03)) | - ((bytes[offset_byte + 4] & 0xff) << 2); - } - } -} - - - -static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) | - ((data[i][offset_data + 1] & 0x07) << 5); - bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) | - ((data[i][offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) | - ((data[i][offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) | - ((data[i][offset_data + 4] & 0x0f) << 4); - bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) | - ((data[i][offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) | - ((data[i][offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) | - ((data[i][offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[i][offset_data + 7] >> 5) & 0xff); - } - } -} - -static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = 
(bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - ((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | - ((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); - } - } -} - -//only BS2POLq no BS2POLp -void PQCLEAN_FIRESABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) { - uint32_t j; - uint32_t offset_data, offset_byte; - +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 8; j++) { offset_byte = 13 * j; offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - ((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 
& (0x0f)) | - ((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); + bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); + bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); + bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); + bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); + bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); + bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); + bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); + bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); + bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); + bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); } } -void PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - POLVECq2BS(bytes, data); +static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 8; j++) { + 
offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); } } -void PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - BS2POLVECp(bytes, data); - } else if (modulus == 8192) { - BS2POLVECq(bytes, data); +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); + bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + } +} + +static void 
BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); + data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); + data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); + data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); + } +} + +void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + } +} + +void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + } +} + +void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + } + } +} + +void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); + + for (j = 0; j < SABER_KEYBYTES; 
j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + } } } diff --git a/crypto_kem/firesaber/clean/pack_unpack.h b/crypto_kem/firesaber/clean/pack_unpack.h index 6509f107..0a8ee253 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.h +++ b/crypto_kem/firesaber/clean/pack_unpack.h @@ -1,28 +1,27 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H - #include "SABER_params.h" #include #include +void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); -void PQCLEAN_FIRESABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_FIRESABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data); - -void PQCLEAN_FIRESABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_FIRESABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar); - -void PQCLEAN_FIRESABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_FIRESABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data); +void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_FIRESABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]); +void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); -void PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); + + +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t 
bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); -void PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); #endif diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c index 6fef45d5..c65175fe 100644 --- a/crypto_kem/firesaber/clean/poly.c +++ b/crypto_kem/firesaber/clean/poly.c @@ -1,21 +1,49 @@ -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ -#include "SABER_params.h" +#include "api.h" #include "cbd.h" #include "fips202.h" +#include "pack_unpack.h" #include "poly.h" +#include "poly_mul.h" +#include -void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) { - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); - - for (size_t i = 0; i < SABER_K; i++) { - PQCLEAN_FIRESABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); +void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { + int i, j; + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_L; j++) { + if (transpose == 1) { + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + } else { + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + } + } + } +} + +void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { + int j; + for (j = 0; j < SABER_L; j++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(b[j], s[j], 
res); + } +} + +void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + int i; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + size_t i; + + shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); } } diff --git a/crypto_kem/firesaber/clean/poly.h b/crypto_kem/firesaber/clean/poly.h index 4f69a068..044e4eec 100644 --- a/crypto_kem/firesaber/clean/poly.h +++ b/crypto_kem/firesaber/clean/poly.h @@ -1,26 +1,15 @@ #ifndef POLY_H #define POLY_H - -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ - - #include "SABER_params.h" #include -typedef struct { - uint16_t coeffs[SABER_N]; -} poly; +void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); -typedef struct { - poly vec[SABER_K]; -} polyvec; +void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); + +void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); -void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed); #endif diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c index 20f1d4ad..27c92f29 100644 --- a/crypto_kem/firesaber/clean/poly_mul.c +++ b/crypto_kem/firesaber/clean/poly_mul.c @@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } } -void PQCLEAN_FIRESABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) { - uint32_t i; - // normal multiplication - uint16_t c[512]; - - for (i = 0; i < 512; i++) { - c[i] = 0; - } +/* res += a*b */ +void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { + uint16_t c[2 * SABER_N] = {0}; + int i; toom_cook_4way(a, b, c); - // reduction - for (i = n; i < 2 * n; i++) { - res[i - n] = (c[i - n] - c[i]) & (p - 1); + /* reduction */ + for (i = SABER_N; i < 2 * SABER_N; i++) { + res[i - SABER_N] += (c[i - SABER_N] - c[i]); } } diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h index 4d960042..e554d60c 100644 --- 
a/crypto_kem/firesaber/clean/poly_mul.h +++ b/crypto_kem/firesaber/clean/poly_mul.h @@ -1,9 +1,9 @@ -#ifndef POLYMUL_H -#define POLYMUL_H - +#ifndef POLY_MUL_H +#define POLY_MUL_H #include "SABER_params.h" #include -void PQCLEAN_FIRESABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n); +void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); + #endif diff --git a/crypto_kem/firesaber/clean/verify.c b/crypto_kem/firesaber/clean/verify.c index 3c571e92..97a302a9 100644 --- a/crypto_kem/firesaber/clean/verify.c +++ b/crypto_kem/firesaber/clean/verify.c @@ -1,3 +1,5 @@ +#include "verify.h" + /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -5,26 +7,25 @@ This file has been adapted from the implementation by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------*/ -#include "verify.h" -#include + /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_FIRESABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) { +uint8_t PQCLEAN_FIRESABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { uint64_t r; size_t i; - r = 0; + for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } r = (~r + 1); // Two's complement r >>= 63; - return (unsigned char)r; + return (uint8_t) r; } /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_FIRESABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { +void PQCLEAN_FIRESABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { size_t i; b = -b; diff --git a/crypto_kem/firesaber/clean/verify.h b/crypto_kem/firesaber/clean/verify.h index 1b69b071..1d5e4cb9 100644 --- a/crypto_kem/firesaber/clean/verify.h +++ b/crypto_kem/firesaber/clean/verify.h @@ -1,6 +1,5 @@ #ifndef VERIFY_H #define VERIFY_H - /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle #include /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_FIRESABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len); +uint8_t PQCLEAN_FIRESABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_FIRESABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); +void PQCLEAN_FIRESABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + #endif diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index 1b7912f6..1cc06c9a 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,4 +14,13 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + - name: avx2 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/lightsaber/avx2/LICENSE b/crypto_kem/lightsaber/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/lightsaber/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile new file mode 100644 index 00000000..0522fe8d --- /dev/null +++ b/crypto_kem/lightsaber/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=liblightsaber_avx2.a +HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o + +CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls 
-Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) + +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c new file mode 100644 index 00000000..3270a8c9 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c @@ -0,0 +1,416 @@ +#include "./polymul/toom-cook_4way.c" +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "randombytes.h" +#include +#include +#include +//#include "randombytes.h" +//#include "./polymul/toom_cook_4/toom-cook_4way.c" + +#define h1 4 //2^(EQ-EP-1) + +#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) + + +static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { + int32_t i, j; + + for (j = 0; j < SABER_KEYBYTES; j++) { + message_dec[j] = 0; + for (i = 0; i < 8; i++) { + message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); + } + } +} + +/*----------------------------------------------------------------------------------- + This routine generates a=[Matrix K x K] of 256-coefficient polynomials +-------------------------------------------------------------------------------------*/ + +static void GenMatrix(polyvec *a, const uint8_t *seed) { + uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; + + uint16_t temp_ar[SABER_N]; + + int i, j, k; + uint16_t mod = (SABER_Q - 1); + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + PQCLEAN_LIGHTSABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); + for (k = 0; k < SABER_N; k++) { + a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; + } + } + } +} + +static void GenSecret(uint16_t 
r[SABER_K][SABER_N], const uint8_t *seed) { + + uint32_t i; + + uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + PQCLEAN_LIGHTSABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); + } +} + +//********************************matrix-vector mul routines***************************************************** +static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { + int64_t i, j; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + for (j = 0; j < NUM_POLY; j++) { + + if (isTranspose == 0) { + toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); + } else { + toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); + } + } + + TC_interpol(c_bucket, res_avx[i]); + } + +} + +static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { + + int64_t i; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); + } + TC_interpol(c_bucket, res_avx); +} + +//********************************matrix-vector mul routines***************************************************** + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { + + polyvec a[SABER_K]; + + uint16_t skpv1[SABER_K][SABER_N]; + + + + uint8_t seed[SABER_SEEDBYTES]; + uint8_t noiseseed[SABER_COINBYTES]; + int32_t i, j, k; + + +//--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + + 
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; + +//--------------AVX declaration ends------------------ + + randombytes(seed, SABER_SEEDBYTES); + + shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(noiseseed, SABER_COINBYTES); + + + GenMatrix(a, seed); //sample matrix A + + GenSecret(skpv1, noiseseed); + + +// Load sk into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + + } + + // Load a into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + + + + //------------------------do the matrix vector multiplication and rounding------------ + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + + // Now truncation + + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); + } + } + + //------------------Pack sk into byte string------- + + PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); + + //------------------Pack pk into byte string------- + + for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string + + + for (i = 0; i < 
SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. + pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + } + +} + + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + + + uint32_t i, j, k; + polyvec a[SABER_K]; // skpv; + uint8_t seed[SABER_SEEDBYTES]; + uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + + + uint16_t skpv1[SABER_K][SABER_N]; + uint16_t temp[SABER_K][SABER_N]; + uint16_t message[SABER_KEYBYTES * 8]; + + uint8_t msk_c[SABER_SCALEBYTES_KEM]; + + //--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod, mod_p; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i vprime_avx[SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + __m256i pkcl_avx[SABER_K][SABER_N / 16]; + + __m256i message_avx[SABER_N / 16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + mod_p = _mm256_set1_epi16(SABER_P - 1); + + + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + + //--------------AVX declaration ends------------------ + for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. 
+ seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + } + + GenMatrix(a, seed); + GenSecret(skpv1, noiseseed); + + // ----------- Load skpv1 into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + } + + // ----------- Load skpv1 into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + //-----------------matrix-vector multiplication and rounding + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + + // Now truncation + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); + + } + } + + + //-----this result should be put in b_prime for later use in server. 
+ for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + + PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string + +//**************client matrix-vector multiplication ends******************// + + //------now calculate the v' + + //-------unpack the public_key + PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); + } + } + + // InnerProduct + //for(k=0;k> i) & 0x01); + } + } + // message encoding + for (i = 0; i < SABER_N / 16; i++) { + message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); + message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); + } + + // SHIFTRIGHT(v'+h1-m mod p, EP-ET) + for (k = 0; k < SABER_N / 16; k++) { + vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); + vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); + vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); + } + + // Unpack avx + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); + } + + PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(msk_c, temp[0]); + + + for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { + ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + } + +} + + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t i, j; + uint16_t sksv[SABER_K][SABER_N]; //secret key of the server + uint16_t pksv[SABER_K][SABER_N]; + uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; + uint8_t 
scale_ar[SABER_SCALEBYTES_KEM]; + uint16_t op[SABER_N]; + + //--------------AVX declaration------------------ + + + //__m256i mod_p; + + __m256i v_avx[SABER_N / 16]; + + //__m256i acc[2*SABER_N/16]; + + __m256i sksv_avx[SABER_K][SABER_N / 16]; + __m256i pksv_avx[SABER_K][SABER_N / 16]; + + //mod_p=_mm256_set1_epi16(SABER_P-1); + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + //--------------AVX declaration ends------------------ + + //-------unpack the public_key + + PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key + PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); + pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); + } + } + + for (i = 0; i < SABER_N / 16; i++) { + v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); + } + + + // InnerProduct(b', s, mod p) + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sksv_avx[j], b_bucket[j]); + } + + vector_vector_mul(pksv_avx, b_bucket, v_avx); + + for (i = 0; i < SABER_N / 16; i++) { + _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); + } + + + for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { + scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; + } + + PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(op, scale_ar); + + + //addition of h2 + for (i = 0; i < SABER_N; i++) { + message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + } + + + POL2MSG(m, message_dec_unpacked); +} diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.h b/crypto_kem/lightsaber/avx2/SABER_indcpa.h new file mode 100644 index 00000000..61ee77ba --- /dev/null +++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.h @@ -0,0 +1,13 @@ +#ifndef INDCPA_H +#define INDCPA_H +#include "SABER_params.h" +#include + +void 
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); + + +#endif diff --git a/crypto_kem/lightsaber/avx2/SABER_params.h b/crypto_kem/lightsaber/avx2/SABER_params.h new file mode 100644 index 00000000..11d34fda --- /dev/null +++ b/crypto_kem/lightsaber/avx2/SABER_params.h @@ -0,0 +1,46 @@ +#ifndef PARAMS_H +#define PARAMS_H +#include "api.h" + + + + +#define SABER_K 2 +#define SABER_MU 10 +#define SABER_ET 3 + + +#define SABER_EQ 13 +#define SABER_EP 10 + +#define SABER_N 256 +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_COINBYTES 32 +#define SABER_KEYBYTES 32 + +#define SABER_HASHBYTES 32 + +#define SABER_POLYBYTES 416 //13*256/8 + +#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) + +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation + +#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) + +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ + +#endif diff --git a/crypto_kem/lightsaber/avx2/api.h 
b/crypto_kem/lightsaber/avx2/api.h new file mode 100644 index 00000000..d1e2105b --- /dev/null +++ b/crypto_kem/lightsaber/avx2/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_LIGHTSABER_AVX2_API_H +#define PQCLEAN_LIGHTSABER_AVX2_API_H + + +#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_ALGNAME "LightSaber" +#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_BYTES 32 +#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_PUBLICKEYBYTES 672 +#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_SECRETKEYBYTES 1568 + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_LIGHTSABER_AVX2_API_H */ diff --git a/crypto_kem/lightsaber/avx2/cbd.c b/crypto_kem/lightsaber/avx2/cbd.c new file mode 100644 index 00000000..a43170e2 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/cbd.c @@ -0,0 +1,51 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ + + +static uint64_t load_littleendian(const unsigned char *x, int bytes) { + int i; + uint64_t r = x[0]; + for (i = 1; i < bytes; i++) { + r |= (uint64_t)x[i] << (8 * i); + } + return r; +} + + +void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { + uint16_t Qmod_minus1 = SABER_Q - 1; + + uint64_t t, d, a[4], b[4]; + int i, j; + + for (i = 0; i < SABER_N / 4; i++) { + t = load_littleendian(buf + 5 * i, 5); + d = 0; + for (j = 0; j < 5; j++) { + d += (t >> j) & 0x0842108421UL; + } + + a[0] = d & 0x1f; + b[0] = (d >> 5) & 0x1f; + a[1] = (d >> 10) & 0x1f; + b[1] = (d >> 15) & 0x1f; + a[2] = (d >> 20) & 0x1f; + b[2] = (d >> 25) & 0x1f; + a[3] = (d >> 30) & 0x1f; + b[3] = (d >> 35); + + r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; + r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; + r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; + r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + } +} diff --git a/crypto_kem/lightsaber/avx2/cbd.h b/crypto_kem/lightsaber/avx2/cbd.h new file mode 100644 index 00000000..01ba76e8 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/cbd.h @@ -0,0 +1,16 @@ +#ifndef CBD_H +#define CBD_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ +#include "poly.h" +#include + +void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); + + +#endif diff --git a/crypto_kem/lightsaber/avx2/kem.c b/crypto_kem/lightsaber/avx2/kem.c new file mode 100644 index 00000000..70221f10 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/kem.c @@ -0,0 +1,79 @@ +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "fips202.h" +#include "randombytes.h" +#include "verify.h" +#include +#include +#include +#include + + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + int i; + + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk + for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + } + + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. + + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec() fails. 
+ return (0); +} + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; + + randombytes(buf, 32); + + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output + + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); + // K^ <-- kr[0:31] + // noiseseed (r) <-- kr[32:63]; + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { + int i; + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message + + // Multitarget countermeasure for coins + contributory KEM + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk + buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; + } + + sha3_512(kr, buf, 64); + + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + + fail = PQCLEAN_LIGHTSABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) + + PQCLEAN_LIGHTSABER_AVX2_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h new file mode 100644 index 
00000000..b80c335d --- /dev/null +++ b/crypto_kem/lightsaber/avx2/kem.h @@ -0,0 +1,35 @@ +#ifndef INDCPA_H +#define INDCPA_H + +#include + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); + + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); + +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); + + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); + + + +//uint64_t clock1,clock2; + +//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; + + +#endif diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c new file mode 100644 index 00000000..e912fd0a --- /dev/null +++ b/crypto_kem/lightsaber/avx2/pack_unpack.c @@ -0,0 +1,502 @@ +#include "pack_unpack.h" + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); + bytes[offset_byte + 2] = 
((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); + } +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; + data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; + data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); + data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; + data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; + data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); + data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); + data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); + } + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); + } +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + data[offset_data] = bytes[j] & 0x0f; + data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; + } +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | 
((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + } +} + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; + data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; + data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; + data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); + } + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] 
= ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); + + bytes[offset_byte + 11] = ( (data[i][ offset_data 
+ 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } +} + + + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | 
((bytes[ offset_byte + 2 ] & 0x0f) << 6); + data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } +} + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 
8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); + data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } + + +} + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( 
(data[i][ offset_data + 6 ] >> 2) & 0xff ); + + bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + //for(i=0;i> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + 
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + //} + + +} + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + /*This function packs 11 bit data stream into 8 bits of data. + */ + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); + + bytes[offset_byte 
+ 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); + + } + } + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + + data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); + + data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); + + data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); + + data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); + + data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); + } + } + + +} + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 
= i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); + } + } + + +} + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { /* Unpacks SABER_K rows of SABER_N 14-bit coefficients from bytes into data; row i starts at byte i * (SABER_N * 14) / 8. */ + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; j++) { /* each iteration turns 7 input bytes into 4 coefficients, low-order bits first */ + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); + } + } + + +} + +void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { /* Serialises data into bytes, choosing the packer from modulus: 1024 calls POLVECp2BS, 8192 calls POLVECq2BS. NOTE: any other modulus falls through both branches and bytes is left unwritten, with no error reported. */ + + if (modulus == 1024) { + PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data); + } else if (modulus == 8192) { + PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(bytes, data); + } +} + +void 
PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) { + + if (modulus == 1024) { + PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(data, bytes); + } else if (modulus == 8192) { + PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(data, bytes); + } + +} diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.h b/crypto_kem/lightsaber/avx2/pack_unpack.h new file mode 100644 index 00000000..9a5d41f0 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/pack_unpack.h @@ -0,0 +1,56 @@ +#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +#include "SABER_params.h" +#include +#include + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); + +void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void 
PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); + + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +#endif diff --git a/crypto_kem/lightsaber/avx2/poly.h b/crypto_kem/lightsaber/avx2/poly.h new file mode 100644 index 00000000..8f2a7574 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/poly.h @@ -0,0 +1,27 @@ +#ifndef POLY_H +#define POLY_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien Stehlé +----------------------------------------------------------------------*/ +#include "SABER_params.h" +#include <stdint.h> + +typedef struct { + uint16_t coeffs[SABER_N]; +} poly; + +typedef struct { + poly vec[SABER_K]; +} polyvec; + +void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); + + +void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); + + +#endif diff --git a/crypto_kem/lightsaber/avx2/polymul/consts.h b/crypto_kem/lightsaber/avx2/polymul/consts.h new file mode 100644 index 00000000..40826398 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/polymul/consts.h @@ -0,0 +1,20 @@ +#include "../SABER_params.h" + +#define AVX_N (SABER_N >> 4) +#define small_len_avx (AVX_N >> 2) + +#define SCHB_N 16 + +#define N_SB (SABER_N >> 2) +#define N_SB_RES (2*N_SB-1) + +#define N_SB_16 (N_SB >> 2) +#define N_SB_16_RES (2*N_SB_16-1) + +#define AVX_N1 16 /*N/16*/ + +#define SCM_SIZE 16 + +// The dimension of a vector. 
i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements +#define NUM_POLY SABER_K +//int NUM_POLY=2; diff --git a/crypto_kem/lightsaber/avx2/polymul/matrix.c b/crypto_kem/lightsaber/avx2/polymul/matrix.c new file mode 100644 index 00000000..5fa35783 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/polymul/matrix.c @@ -0,0 +1,303 @@ +#include + +static void transpose_n1(__m256i *M) +{ + //int i; + register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + register __m256i temp, temp0, temp1, temp2; + + //for(i=0; i<8; i=i+1) + //{ + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + + + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = _mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + + + r0 = 
_mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + + //for(i=0; i<4; i=i+1) + //{ + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = _mm256_unpacklo_epi32(r6, r7); + + //} + + + //for(i=0; i<2; i=i+1) + //{ + r8 = _mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + //} +// for(i=0; i<2; i=i+1) +// { + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + +// } + + //------------------------------------------------------- + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +/* +void transpose_unrolled(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + + //for(i=0; i<8; i=i+1) + //{ + tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); + tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); + + tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); + 
tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); + + tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); + tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); + + tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); + tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); + + tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); + tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); + + tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); + tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); + + tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); + tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); + + tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); + tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); + + //} + + //------------------------------------------------------- + //for(i=0; i<4; i=i+1) + //{ + bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); + bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); + + bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); + bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); + + bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); + bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); + + bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); + bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); + + //} + + //for(i=0; i<2; i=i+1) + //{ + dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); + dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); + + dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); + dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); + + M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); + M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); + M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); + M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); + + //} + //for(i=0; i<2; i=i+1) + //{ + eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); + eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); + + eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); + eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); + + //} + + //------------------------------------------------------- + + //------------------------------------------------------- + for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = 
_mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + + for(i=0; i<2; i=i+1) + { + fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); + fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); + gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); + } + + //------------------------------------------------------- + + + + M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); + M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); + M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); + M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); + + M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); + M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); + M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); + M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); + + M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); + M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); + M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); + M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); +} + + +void transpose1(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + for(i=0; i<8; i=i+1) + { + tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); + tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); + } + + for(i=0; i<4; i=i+1) + { + bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); + bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); + } + for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); + dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); + eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + fL[i] = 
_mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); + fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); + gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); + } + + M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); + M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); + M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); + M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); + + M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); + M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); + M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); + M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); + + M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); + M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); + M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); + M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); + + M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); + M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); + M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); + M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); +} +*/ diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c new file mode 100644 index 00000000..4e4f11f8 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c @@ -0,0 +1,753 @@ +//#define SCM_SIZE 16 + +//#pragma STDC FP_CONTRACT ON + +#include <immintrin.h> + +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { /* Returns a*b + c per 16-bit lane: _mm256_mullo_epi16 keeps the low 16 bits of each product and _mm256_add_epi16 adds with wrap-around. Declared static because a plain C99/C11 `inline` definition provides no external symbol, so any call the compiler does not inline would fail to link. */ + return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); +} + + +static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched + //the c_avx are added cummulatively +{ + + register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + register __m256i temp; + + + a0=a[0]; + a1=a[1]; + a2=a[2]; + a3=a[3]; + a4=a[4]; + a5=a[5]; + a6=a[6]; + a7=a[7]; + + b0=b[0]; + b1=b[1]; + b2=b[2]; + b3=b[3]; + 
b4=b[4]; + b5=b[5]; + b6=b[6]; + b7=b[7]; + + // New Unrolled first triangle + + //otherwise accumulate + c_avx[0] = mul_add(a0, b0, c_avx[0]); + + + temp = _mm256_mullo_epi16 (a0, b1); + temp=mul_add(a1, b0, temp); + c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); + + + temp = _mm256_mullo_epi16 (a0, b2); + temp = mul_add(a1, b1, temp); + temp=mul_add(a2, b0, temp); + c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); + + + temp = _mm256_mullo_epi16 (a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + temp=mul_add(a3, b0, temp); + c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); + + temp = _mm256_mullo_epi16 (a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + temp=mul_add(a2, b2, temp); + c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); + + + temp = _mm256_mullo_epi16 (a0, b5); + temp = mul_add(a1, b4 , temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + temp=mul_add(a5, b0, temp); + c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); + + temp = _mm256_mullo_epi16 (a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + temp=mul_add(a4, b2, temp); + c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); + + + temp = _mm256_mullo_epi16 (a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add (a6, b1, temp); + temp = mul_add (a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add (a3, b4, temp); + temp = mul_add (a4, b3, temp); + temp=mul_add(a5, b2, temp); + c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); + + temp = _mm256_mullo_epi16 (a0, b[8]); + temp = mul_add (a1, b7, temp); + temp = mul_add (a7, b1, temp); + temp = mul_add (a[8], b0, temp); + temp = mul_add (a2, b6,temp); + temp = mul_add(a3, b5, temp); + temp = mul_add (a4, b4,temp); + temp = mul_add (a5, b3, temp); + + temp=mul_add(a6, b2, temp); + c_avx[8] = _mm256_add_epi16(temp, 
c_avx[8]); + + + temp = _mm256_mullo_epi16 (a0, b[9]); + temp = mul_add (a1, b[8], temp); + temp = mul_add (a[8], b1, temp); + temp = mul_add (a[9], b0, temp); + temp = mul_add (a2, b7, temp); + temp = mul_add (a3, b6, temp); + temp = mul_add (a4, b5, temp); + temp = mul_add (a5, b4, temp); + temp = mul_add (a6, b3, temp); + temp=mul_add(a7, b2, temp); + c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); + + + temp= _mm256_mullo_epi16 (a0, b[10]); + temp = mul_add (a1, b[9], temp); + temp = mul_add (a[9], b1, temp); + temp = mul_add (a[10], b0, temp); + temp = mul_add (a2, b[8], temp); + temp = mul_add (a3, b7, temp); + temp = mul_add (a4, b6, temp); + temp = mul_add (a5, b5, temp); + temp = mul_add (a6, b4, temp); + temp = mul_add (a7, b3, temp); + temp=mul_add(a[8], b2, temp); + c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); + + + temp = _mm256_mullo_epi16 (a0, b[11]); + temp = mul_add (a1, b[10], temp ); + temp = mul_add (a[10], b1, temp ); + temp = mul_add (a[11], b0, temp ); + temp = mul_add (a2, b[9], temp ); + temp = mul_add (a3, b[8], temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a[8], b3, temp ); + temp=mul_add(a[9], b2, temp); + c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); + + + temp = _mm256_mullo_epi16 (a0, b[12]); + temp = mul_add (a1, b[11], temp); + temp = mul_add (a[11], b1, temp); + temp = mul_add (a[12], b0, temp); + temp = mul_add (a2, b[10], temp); + temp = mul_add (a3, b[9], temp); + temp = mul_add (a4, b[8], temp); + temp = mul_add (a5, b7, temp); + temp = mul_add (a6, b6, temp); + temp = mul_add (a7, b5, temp); + temp = mul_add (a[8], b4, temp); + temp = mul_add (a[9], b3, temp); + temp=mul_add(a[10], b2, temp); + c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); + + + temp = _mm256_mullo_epi16 (a0, b[13]); + temp = mul_add (a1, b[12], temp ); + temp = mul_add (a[12], b1, temp ); + temp = mul_add (a[13], b0, temp ); + temp = mul_add 
(a2, b[11], temp ); + temp = mul_add (a3, b[10], temp ); + temp = mul_add (a4, b[9], temp ); + temp = mul_add (a5, b[8], temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a[8], b5, temp ); + temp = mul_add (a[9], b4, temp ); + temp = mul_add (a[10], b3, temp ); + temp=mul_add(a[11], b2, temp); + c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); + + + + temp = _mm256_mullo_epi16 (a0, b[14]); + temp = mul_add (a1, b[13], temp ); + temp = mul_add (a[13], b1, temp ); + temp = mul_add (a[14], b0, temp ); + temp = mul_add (a2, b[12], temp ); + temp = mul_add (a3, b[11], temp ); + temp = mul_add (a4, b[10], temp ); + temp = mul_add (a5, b[9], temp ); + temp = mul_add (a6, b[8], temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a[8], b6, temp ); + temp = mul_add (a[9], b5, temp ); + temp = mul_add (a[10], b4, temp ); + temp = mul_add (a[11], b3, temp ); + temp=mul_add(a[12], b2, temp); + c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); + + + temp = _mm256_mullo_epi16 (a0, b[15]); + temp = mul_add (a1, b[14], temp ); + temp = mul_add (a[14], b1, temp ); + temp = mul_add (a[15], b0, temp ); + temp = mul_add (a2, b[13], temp ); + temp = mul_add (a3, b[12], temp ); + temp = mul_add (a4, b[11], temp ); + temp = mul_add (a5, b[10], temp ); + temp = mul_add (a6, b[9], temp ); + temp = mul_add (a7, b[8], temp ); + temp = mul_add (a[8], b7, temp ); + temp = mul_add (a[9], b6, temp ); + temp = mul_add (a[10], b5, temp ); + temp = mul_add (a[11], b4, temp ); + temp = mul_add (a[12], b3, temp ); + temp=mul_add(a[13], b2, temp); + c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); + + + // unrolled second triangle + a0=a[14]; + a1=a[15]; + a2=a[13]; + a3=a[12]; + a4=a[11]; + a5=a[10]; + a6=a[9]; + a7=a[8]; + + b0=b[14]; + b1=b[15]; + b2=b[13]; + b3=b[12]; + b4=b[11]; + b5=b[10]; + b6=b[9]; + b7=b[8]; + + temp = _mm256_mullo_epi16 (a[1], b1); + temp = mul_add (a[2], b0, temp ); + temp = mul_add (a[3], b2, temp ); + temp = mul_add (a[4], b3, 
temp ); + temp = mul_add (a[5], b4, temp ); + temp = mul_add (a[6], b5, temp ); + temp = mul_add (a[7], b6, temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a6, b[7], temp ); + temp = mul_add (a5, b[6], temp ); + temp = mul_add (a4, b[5], temp ); + temp = mul_add (a3, b[4], temp ); + temp = mul_add (a2, b[3], temp ); + temp = mul_add (a0, b[2], temp ); + temp=mul_add(a1, b[1], temp); + c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); + + + temp = _mm256_mullo_epi16 (a[2], b1); + temp = mul_add (a[3], b0, temp ); + temp = mul_add (a[4], b2, temp ); + temp = mul_add (a[5], b3, temp ); + temp = mul_add (a[6], b4, temp ); + temp = mul_add (a[7], b5, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a5, b[7], temp ); + temp = mul_add (a4, b[6], temp ); + temp = mul_add (a3, b[5], temp ); + temp = mul_add (a2, b[4], temp ); + temp = mul_add (a0, b[3], temp ); + temp=mul_add(a1, b[2], temp); + c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); + + + temp = _mm256_mullo_epi16 (a[3], b1); + temp = mul_add (a[4], b0, temp ); + temp = mul_add (a[5], b2, temp ); + temp = mul_add (a[6], b3, temp ); + temp = mul_add (a[7], b4, temp ); + temp = mul_add (a7, b5, temp ); + temp = mul_add (a6, b6, temp ); + temp = mul_add (a5, b7, temp ); + temp = mul_add (a4, b[7], temp ); + temp = mul_add (a3, b[6], temp ); + temp = mul_add (a2, b[5], temp ); + temp = mul_add (a0, b[4], temp ); + temp=mul_add(a1, b[3], temp); + c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); + + + temp = _mm256_mullo_epi16 (a[4], b1); + temp = mul_add (a[5], b0, temp ); + temp = mul_add (a[6], b2, temp ); + temp = mul_add (a[7], b3, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a3, b[7], temp ); + temp = mul_add (a2, b[6], temp ); + temp = mul_add (a0, b[5], temp ); + temp=mul_add(a1, b[4], temp); + c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); + + + 
temp = _mm256_mullo_epi16 (a[5], b1); + temp = mul_add (a[6], b0, temp ); + temp = mul_add (a[7], b2, temp ); + temp = mul_add (a7, b3, temp ); + temp = mul_add (a6, b4, temp ); + temp = mul_add (a5, b5, temp ); + temp = mul_add (a4, b6, temp ); + temp = mul_add (a3, b7, temp ); + temp = mul_add (a2, b[7], temp ); + temp = mul_add (a0, b[6], temp ); + temp=mul_add(a1, b[5], temp); + c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); + + + temp = _mm256_mullo_epi16 (a[6], b1); + temp = mul_add (a[7], b0, temp ); + temp = mul_add (a7, b2, temp ); + temp = mul_add (a6, b3, temp ); + temp = mul_add (a5, b4, temp ); + temp = mul_add (a4, b5, temp ); + temp = mul_add (a3, b6, temp ); + temp = mul_add (a2, b7, temp ); + temp = mul_add (a0, b[7], temp ); + temp=mul_add(a1, b[6], temp); + c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); + + + temp = _mm256_mullo_epi16 (a[7], b1); + temp = mul_add (a7, b0, temp ); + temp = mul_add (a6, b2, temp ); + temp = mul_add (a5, b3, temp ); + temp = mul_add (a4, b4, temp ); + temp = mul_add (a3, b5, temp ); + temp = mul_add (a2, b6, temp ); + temp = mul_add (a0, b7, temp ); + temp=mul_add(a1, b[7], temp); + c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); + + + temp = _mm256_mullo_epi16 (a7, b1); + temp = mul_add (a6, b0, temp ); + temp = mul_add (a5, b2, temp ); + temp = mul_add (a4, b3, temp ); + temp = mul_add (a3, b4, temp ); + temp = mul_add (a2, b5, temp ); + temp = mul_add (a0, b6, temp ); + temp=mul_add(a1, b7, temp); + c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); + + + temp = _mm256_mullo_epi16 (a6, b1); + temp = mul_add (a5, b0, temp ); + temp = mul_add (a4, b2, temp ); + temp = mul_add (a3, b3, temp ); + temp = mul_add (a2, b4, temp ); + temp = mul_add (a0, b5, temp ); + temp=mul_add(a1, b6, temp); + c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); + + + temp = _mm256_mullo_epi16 (a5, b1); + temp = mul_add (a4, b0, temp ); + temp = mul_add (a3, b2, temp ); + temp = mul_add (a2, b3, temp ); + temp = mul_add (a0, b4, temp ); + 
temp=mul_add(a1, b5, temp); + c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); + + + temp = _mm256_mullo_epi16 (a4, b1); + temp = mul_add (a3, b0, temp ); + temp = mul_add (a2, b2, temp ); + temp = mul_add (a0, b3, temp ); + temp=mul_add(a1, b4, temp); + c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); + + + temp = _mm256_mullo_epi16 (a3, b1); + temp = mul_add (a2, b0, temp ); + temp = mul_add (a0, b2, temp ); + temp=mul_add(a1, b3, temp); + c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); + + + temp = _mm256_mullo_epi16 (a2, b1); + temp = mul_add (a0, b0, temp ); + temp=mul_add(a1, b2, temp); + c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); + + + temp = _mm256_mullo_epi16 (a0, b1); + temp=mul_add(a1, b0, temp); + c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); + + + c_avx[30] = mul_add(a1, b1, c_avx[30]); + + + + c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); + + +} + + + +static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched + //the c_avx are not added cummulatively +{ + + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + + a0=a[0]; + a1=a[1]; + a2=a[2]; + a3=a[3]; + a4=a[4]; + a5=a[5]; + a6=a[6]; + a7=a[7]; + + b0=b[0]; + b1=b[1]; + b2=b[2]; + b3=b[3]; + b4=b[4]; + b5=b[5]; + b6=b[6]; + b7=b[7]; + + // New Unrolled first triangle + c_avx[0] = _mm256_mullo_epi16 (a0, b0); + + temp = _mm256_mullo_epi16 (a0, b1); + c_avx[1]=mul_add(a1, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b2); + + temp = mul_add(a1, b1, temp); + c_avx[2]= mul_add(a2, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + c_avx[3]= mul_add(a3, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + c_avx[4]= mul_add(a2, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b5); + temp = mul_add(a1, b4 , temp); + temp = mul_add(a2, b3, 
temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + c_avx[5] = mul_add(a5, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + c_avx[6] = mul_add(a4, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add (a6, b1, temp); + temp = mul_add (a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add (a3, b4, temp); + temp = mul_add (a4, b3, temp); + c_avx[7] = mul_add (a5, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[8]); + temp = mul_add (a1, b7, temp); + temp = mul_add (a7, b1, temp); + temp = mul_add (a[8], b0, temp); + temp = mul_add (a2, b6,temp); + temp = mul_add(a3, b5, temp); + temp = mul_add (a4, b4,temp); + temp = mul_add (a5, b3, temp); + c_avx[8] = mul_add (a6, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[9]); + temp = mul_add (a1, b[8], temp); + temp = mul_add (a[8], b1, temp); + temp = mul_add (a[9], b0, temp); + temp = mul_add (a2, b7, temp); + temp = mul_add (a3, b6, temp); + temp = mul_add (a4, b5, temp); + temp = mul_add (a5, b4, temp); + temp = mul_add (a6, b3, temp); + c_avx[9] = mul_add (a7, b2, temp); + + temp= _mm256_mullo_epi16 (a0, b[10]); + temp = mul_add (a1, b[9], temp); + temp = mul_add (a[9], b1, temp); + temp = mul_add (a[10], b0, temp); + temp = mul_add (a2, b[8], temp); + temp = mul_add (a3, b7, temp); + temp = mul_add (a4, b6, temp); + temp = mul_add (a5, b5, temp); + temp = mul_add (a6, b4, temp); + temp = mul_add (a7, b3, temp); + c_avx[10] = mul_add (a[8], b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[11]); + temp = mul_add (a1, b[10], temp ); + temp = mul_add (a[10], b1, temp ); + temp = mul_add (a[11], b0, temp ); + temp = mul_add (a2, b[9], temp ); + temp = mul_add (a3, b[8], temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a6, b5, temp ); + temp = 
mul_add (a7, b4, temp ); + temp = mul_add (a[8], b3, temp ); + c_avx[11] = mul_add (a[9], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[12]); + temp = mul_add (a1, b[11], temp); + temp = mul_add (a[11], b1, temp); + temp = mul_add (a[12], b0, temp); + temp = mul_add (a2, b[10], temp); + temp = mul_add (a3, b[9], temp); + temp = mul_add (a4, b[8], temp); + temp = mul_add (a5, b7, temp); + temp = mul_add (a6, b6, temp); + temp = mul_add (a7, b5, temp); + temp = mul_add (a[8], b4, temp); + temp = mul_add (a[9], b3, temp); + c_avx[12] = mul_add (a[10], b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[13]); + temp = mul_add (a1, b[12], temp ); + temp = mul_add (a[12], b1, temp ); + temp = mul_add (a[13], b0, temp ); + temp = mul_add (a2, b[11], temp ); + temp = mul_add (a3, b[10], temp ); + temp = mul_add (a4, b[9], temp ); + temp = mul_add (a5, b[8], temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a[8], b5, temp ); + temp = mul_add (a[9], b4, temp ); + temp = mul_add (a[10], b3, temp ); + c_avx[13] = mul_add (a[11], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[14]); + temp = mul_add (a1, b[13], temp ); + temp = mul_add (a[13], b1, temp ); + temp = mul_add (a[14], b0, temp ); + temp = mul_add (a2, b[12], temp ); + temp = mul_add (a3, b[11], temp ); + temp = mul_add (a4, b[10], temp ); + temp = mul_add (a5, b[9], temp ); + temp = mul_add (a6, b[8], temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a[8], b6, temp ); + temp = mul_add (a[9], b5, temp ); + temp = mul_add (a[10], b4, temp ); + temp = mul_add (a[11], b3, temp ); + c_avx[14] = mul_add (a[12], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[15]); + temp = mul_add (a1, b[14], temp ); + temp = mul_add (a[14], b1, temp ); + temp = mul_add (a[15], b0, temp ); + temp = mul_add (a2, b[13], temp ); + temp = mul_add (a3, b[12], temp ); + temp = mul_add (a4, b[11], temp ); + temp = mul_add (a5, b[10], temp ); + temp = mul_add (a6, b[9], temp ); + temp = 
mul_add (a7, b[8], temp ); + temp = mul_add (a[8], b7, temp ); + temp = mul_add (a[9], b6, temp ); + temp = mul_add (a[10], b5, temp ); + temp = mul_add (a[11], b4, temp ); + temp = mul_add (a[12], b3, temp ); + c_avx[15] = mul_add (a[13], b2, temp ); + + + // unrolled second triangle + a0=a[14]; + a1=a[15]; + a2=a[13]; + a3=a[12]; + a4=a[11]; + a5=a[10]; + a6=a[9]; + a7=a[8]; + + b0=b[14]; + b1=b[15]; + b2=b[13]; + b3=b[12]; + b4=b[11]; + b5=b[10]; + b6=b[9]; + b7=b[8]; + + + temp = _mm256_mullo_epi16 (a[1], b1); + temp = mul_add (a[2], b0, temp ); + temp = mul_add (a[3], b2, temp ); + temp = mul_add (a[4], b3, temp ); + temp = mul_add (a[5], b4, temp ); + temp = mul_add (a[6], b5, temp ); + temp = mul_add (a[7], b6, temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a6, b[7], temp ); + temp = mul_add (a5, b[6], temp ); + temp = mul_add (a4, b[5], temp ); + temp = mul_add (a3, b[4], temp ); + temp = mul_add (a2, b[3], temp ); + temp = mul_add (a0, b[2], temp ); + c_avx[16] = mul_add (a1, b[1], temp ); + + temp = _mm256_mullo_epi16 (a[2], b1); + temp = mul_add (a[3], b0, temp ); + temp = mul_add (a[4], b2, temp ); + temp = mul_add (a[5], b3, temp ); + temp = mul_add (a[6], b4, temp ); + temp = mul_add (a[7], b5, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a5, b[7], temp ); + temp = mul_add (a4, b[6], temp ); + temp = mul_add (a3, b[5], temp ); + temp = mul_add (a2, b[4], temp ); + temp = mul_add (a0, b[3], temp ); + c_avx[17] = mul_add (a1, b[2], temp ); + + temp = _mm256_mullo_epi16 (a[3], b1); + temp = mul_add (a[4], b0, temp ); + temp = mul_add (a[5], b2, temp ); + temp = mul_add (a[6], b3, temp ); + temp = mul_add (a[7], b4, temp ); + temp = mul_add (a7, b5, temp ); + temp = mul_add (a6, b6, temp ); + temp = mul_add (a5, b7, temp ); + temp = mul_add (a4, b[7], temp ); + temp = mul_add (a3, b[6], temp ); + temp = mul_add (a2, b[5], temp ); + temp = mul_add (a0, b[4], temp ); + c_avx[18] = mul_add (a1, 
b[3], temp ); + + temp = _mm256_mullo_epi16 (a[4], b1); + temp = mul_add (a[5], b0, temp ); + temp = mul_add (a[6], b2, temp ); + temp = mul_add (a[7], b3, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a3, b[7], temp ); + temp = mul_add (a2, b[6], temp ); + temp = mul_add (a0, b[5], temp ); + c_avx[19] = mul_add (a1, b[4], temp ); + + temp = _mm256_mullo_epi16 (a[5], b1); + temp = mul_add (a[6], b0, temp ); + temp = mul_add (a[7], b2, temp ); + temp = mul_add (a7, b3, temp ); + temp = mul_add (a6, b4, temp ); + temp = mul_add (a5, b5, temp ); + temp = mul_add (a4, b6, temp ); + temp = mul_add (a3, b7, temp ); + temp = mul_add (a2, b[7], temp ); + temp = mul_add (a0, b[6], temp ); + c_avx[20] = mul_add (a1, b[5], temp ); + + temp = _mm256_mullo_epi16 (a[6], b1); + temp = mul_add (a[7], b0, temp ); + temp = mul_add (a7, b2, temp ); + temp = mul_add (a6, b3, temp ); + temp = mul_add (a5, b4, temp ); + temp = mul_add (a4, b5, temp ); + temp = mul_add (a3, b6, temp ); + temp = mul_add (a2, b7, temp ); + temp = mul_add (a0, b[7], temp ); + c_avx[21] = mul_add (a1, b[6], temp ); + + temp = _mm256_mullo_epi16 (a[7], b1); + temp = mul_add (a7, b0, temp ); + temp = mul_add (a6, b2, temp ); + temp = mul_add (a5, b3, temp ); + temp = mul_add (a4, b4, temp ); + temp = mul_add (a3, b5, temp ); + temp = mul_add (a2, b6, temp ); + temp = mul_add (a0, b7, temp ); + c_avx[22] = mul_add (a1, b[7], temp ); + + temp = _mm256_mullo_epi16 (a7, b1); + temp = mul_add (a6, b0, temp ); + temp = mul_add (a5, b2, temp ); + temp = mul_add (a4, b3, temp ); + temp = mul_add (a3, b4, temp ); + temp = mul_add (a2, b5, temp ); + temp = mul_add (a0, b6, temp ); + c_avx[23] = mul_add (a1, b7, temp ); + + temp = _mm256_mullo_epi16 (a6, b1); + temp = mul_add (a5, b0, temp ); + temp = mul_add (a4, b2, temp ); + temp = mul_add (a3, b3, temp ); + temp = mul_add (a2, b4, temp ); + temp = 
mul_add (a0, b5, temp ); + c_avx[24] = mul_add (a1, b6, temp ); + + temp = _mm256_mullo_epi16 (a5, b1); + temp = mul_add (a4, b0, temp ); + temp = mul_add (a3, b2, temp ); + temp = mul_add (a2, b3, temp ); + temp = mul_add (a0, b4, temp ); + c_avx[25] = mul_add (a1, b5, temp ); + + temp = _mm256_mullo_epi16 (a4, b1); + temp = mul_add (a3, b0, temp ); + temp = mul_add (a2, b2, temp ); + temp = mul_add (a0, b3, temp ); + c_avx[26] = mul_add (a1, b4, temp ); + + temp = _mm256_mullo_epi16 (a3, b1); + temp = mul_add (a2, b0, temp ); + temp = mul_add (a0, b2, temp ); + c_avx[27] = mul_add (a1, b3, temp ); + + temp = _mm256_mullo_epi16 (a2, b1); + temp = mul_add (a0, b0, temp ); + c_avx[28] = mul_add (a1, b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b1); + c_avx[29] = mul_add (a1, b0, temp); + + c_avx[30] = _mm256_mullo_epi16 (a1, b1); + + + c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); + +} diff --git a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c new file mode 100644 index 00000000..78fb86c2 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c @@ -0,0 +1,1010 @@ +/* +Cleaned version for step by step approach look into the _debug file +*/ +//#include "timing.c" +#include "consts.h" +#include "matrix.c" +#include "scm_avx.c" + +static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX. 
+{ + __m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time + + //uint16_t i; + + register __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + + + //CLOCK1=cpucycles(); + + //------------------AVX evaluation for 1st poly----------------------- + + r0_avx=a[0]; + r1_avx=a[1]; + r2_avx=a[2]; + r3_avx=a[3]; + a_bucket[0]=r0_avx; + a_bucket[1]=r1_avx; + a_bucket[2]=r2_avx; + a_bucket[3]=r3_avx; + a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]); + + + //------------------AVX evaluation for 1st poly ends------------------ + + + //------------------AVX evaluation for 2nd poly----------------------- + r0_avx=a[small_len_avx]; + r1_avx=a[small_len_avx+1]; + r2_avx=a[small_len_avx+2]; + r3_avx=a[small_len_avx+3]; + a_bucket[0+9]=r0_avx; + a_bucket[1+9]=r1_avx; + a_bucket[2+9]=r2_avx; + a_bucket[3+9]=r3_avx; + a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]); + + + //------------------AVX evaluation for 2nd poly ends------------------ + + + //------------------AVX evaluation for 3rd poly----------------------- + r0_avx=a[2*small_len_avx]; + r1_avx=a[2*small_len_avx+1]; + r2_avx=a[2*small_len_avx+2]; + r3_avx=a[2*small_len_avx+3]; + a_bucket[0+18]=r0_avx; + a_bucket[1+18]=r1_avx; + a_bucket[2+18]=r2_avx; + a_bucket[3+18]=r3_avx; + a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]); + + //------------------AVX 
evaluation for 3rd poly ends------------------ + + + //------------------AVX evaluation for 4th poly----------------------- + + r0_avx=a[3*small_len_avx]; + r1_avx=a[3*small_len_avx+1]; + r2_avx=a[3*small_len_avx+2]; + r3_avx=a[3*small_len_avx+3]; + a_bucket[0+27]=r0_avx; + a_bucket[1+27]=r1_avx; + a_bucket[2+27]=r2_avx; + a_bucket[3+27]=r3_avx; + a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]); + + //------------------AVX evaluation for 4th poly ends------------------ + + //------------------AVX evaluation for 5th poly----------------------- + + r0_avx=a[4*small_len_avx+0]; + r1_avx=a[4*small_len_avx+1]; + r2_avx=a[4*small_len_avx+2]; + r3_avx=a[4*small_len_avx+3]; + a_bucket[0+36]=r0_avx; + a_bucket[1+36]=r1_avx; + a_bucket[2+36]=r2_avx; + a_bucket[3+36]=r3_avx; + a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]); + + //------------------AVX evaluation for 5th poly ends------------------ + + + //------------------AVX evaluation for 6th poly----------------------- + r0_avx=a[5*small_len_avx]; + r1_avx=a[5*small_len_avx+1]; + r2_avx=a[5*small_len_avx+2]; + r3_avx=a[5*small_len_avx+3]; + a_bucket[0+45]=r0_avx; + a_bucket[1+45]=r1_avx; + a_bucket[2+45]=r2_avx; + a_bucket[3+45]=r3_avx; + a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]); + + //------------------AVX evaluation for 6th poly ends------------------ + 
+ //------------------AVX evaluation for 7th poly----------------------- + + r0_avx=a[6*small_len_avx]; + r1_avx=a[6*small_len_avx+1]; + r2_avx=a[6*small_len_avx+2]; + r3_avx=a[6*small_len_avx+3]; + a_bucket[0+54]=r0_avx; + a_bucket[1+54]=r1_avx; + a_bucket[2+54]=r2_avx; + a_bucket[3+54]=r3_avx; + a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]); + + //------------------AVX evaluation for 7th poly ends------------------ + + + + //CLOCK2=cpucycles(); + //CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1); + //printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1); + + + //CLOCK1=cpucycles(); + //-----------------Forward transposes-------------------------------------- + transpose_n1(a_bucket); + transpose_n1(a_bucket+16); + transpose_n1(a_bucket+32); + transpose_n1(a_bucket+48); + + //-----------------Forwatrd transposes ends--------------------------------- + + //----------------------all multiplications--------------------------------- + if(f==0){ + schoolbook_avx_new2(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); + schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); + schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); + } + else{ + schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); + //schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); + schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); + } + /* + schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f); + schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f); + schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, 
c_bucket+4*SCM_SIZE, f); + schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f); + */ + + + //----------------------all multiplications ends----------------------------- + + + //-----------------Reverse transposes-------------------------------------- + + /* + transpose(c_bucket); + transpose(c_bucket+16); + + transpose(c_bucket+2*SCM_SIZE); + transpose(c_bucket+16+2*SCM_SIZE); + + transpose(c_bucket+4*SCM_SIZE); + transpose(c_bucket+16+4*SCM_SIZE); + + transpose(c_bucket+6*SCM_SIZE); + transpose(c_bucket+16+6*SCM_SIZE); + */ + //-----------------Reverse transposes ends--------------------------------- + + //CLOCK2=cpucycles(); + //CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1); + + //KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6); + +} + +static void KARA_eval(__m256i* b, __m256i *b_bucket){ + + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + + //-------1st poly---------------------------------------------------- + r0_avx=b[0]; + r1_avx=b[1]; + r2_avx=b[2]; + r3_avx=b[3]; + b_bucket[0]=r0_avx; + b_bucket[1]=r1_avx; + b_bucket[2]=r2_avx; + b_bucket[3]=r3_avx; + b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]); + //-------2nd poly---------------------------------------------------- + + r0_avx=b[small_len_avx]; + r1_avx=b[small_len_avx+1]; + r2_avx=b[small_len_avx+2]; + r3_avx=b[small_len_avx+3]; + b_bucket[0+9]=r0_avx; + b_bucket[1+9]=r1_avx; + b_bucket[2+9]=r2_avx; + b_bucket[3+9]=r3_avx; + b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]); + + //-------3rd 
poly---------------------------------------------------- + + r0_avx=b[2*small_len_avx+0]; + r1_avx=b[2*small_len_avx+1]; + r2_avx=b[2*small_len_avx+2]; + r3_avx=b[2*small_len_avx+3]; + b_bucket[0+18]=r0_avx; + b_bucket[1+18]=r1_avx; + b_bucket[2+18]=r2_avx; + b_bucket[3+18]=r3_avx; + b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]); + + //-------4th poly---------------------------------------------------- + r0_avx=b[3*small_len_avx]; + r1_avx=b[3*small_len_avx+1]; + r2_avx=b[3*small_len_avx+2]; + r3_avx=b[3*small_len_avx+3]; + b_bucket[0+27]=r0_avx; + b_bucket[1+27]=r1_avx; + b_bucket[2+27]=r2_avx; + b_bucket[3+27]=r3_avx; + b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]); + + //-------5th poly---------------------------------------------------- + + r0_avx=b[4*small_len_avx]; + r1_avx=b[4*small_len_avx+1]; + r2_avx=b[4*small_len_avx+2]; + r3_avx=b[4*small_len_avx+3]; + b_bucket[0+36]=r0_avx; + b_bucket[1+36]=r1_avx; + b_bucket[2+36]=r2_avx; + b_bucket[3+36]=r3_avx; + b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]); + + //-------6th poly---------------------------------------------------- + + r0_avx=b[5*small_len_avx]; + r1_avx=b[5*small_len_avx+1]; + r2_avx=b[5*small_len_avx+2]; + r3_avx=b[5*small_len_avx+3]; + b_bucket[0+45]=r0_avx; + b_bucket[1+45]=r1_avx; + b_bucket[2+45]=r2_avx; + b_bucket[3+45]=r3_avx; 
+ b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]); + + //-------7th poly---------------------------------------------------- + + r0_avx=b[6*small_len_avx]; + r1_avx=b[6*small_len_avx+1]; + r2_avx=b[6*small_len_avx+2]; + r3_avx=b[6*small_len_avx+3]; + b_bucket[0+54]=r0_avx; + b_bucket[1+54]=r1_avx; + b_bucket[2+54]=r2_avx; + b_bucket[3+54]=r3_avx; + b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]); + + //--------------Evaluating B poly ends------------------------------- + + transpose_n1(b_bucket); + transpose_n1(b_bucket+16); + transpose_n1(b_bucket+32); + transpose_n1(b_bucket+48); +} + +static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){ + + //int64_t i; + register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results + + __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; + + //CLOCK1=cpucycles(); + + //------------------------AVX interpolation for 1st poly external------------------- + + //loop1 + res_avx0 = c_bucket[0]; + res_avx2 = c_bucket[1]; + res_avx4 = c_bucket[2]; + res_avx6 = c_bucket[3]; + + c6_avx=c_bucket[6]; + c7_avx=c_bucket[7]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx); + + res_avx1 = c_bucket[16]; + res_avx3 = c_bucket[17]; + res_avx5 = c_bucket[18]; + res_avx7 = c_bucket[19]; + + c22_avx=c_bucket[22]; + 
c23_avx=c_bucket[23]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final0[0]=res_avx0; + result_final0[1]=res_avx1; + + result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final0[6]=res_avx6; + result_final0[7]=res_avx7; + + + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + + //loop1 + res_avx0 = c_bucket[9]; //c_bucket0 + res_avx2 = c_bucket[10]; //c_bucket1 + res_avx4 = c_bucket[11]; //c_bucket2 + res_avx6 = c_bucket[12]; //c_bucket3 + + c6_avx=c_bucket[15]; //c_bucket6 + c7_avx=c_bucket[32]; //c_bucket7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); + + res_avx1 = c_bucket[25]; //c_bucket0 + res_avx3 = c_bucket[26]; //c_bucket1 + res_avx5 = c_bucket[27]; //c_bucket2 
+ res_avx7 = c_bucket[28]; //c_bucket3 + + c22_avx=c_bucket[31]; + c23_avx=c_bucket[48]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final1[0]=res_avx0; + result_final1[1]=res_avx1; + + result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final1[6]=res_avx6; + result_final1[7]=res_avx7; + + + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + + //loop1 + res_avx0 = c_bucket[34]; //c_bucket0 + res_avx2 = c_bucket[35]; //c_bucket1 + res_avx4 = c_bucket[36]; + res_avx6 = c_bucket[37]; + + c6_avx=c_bucket[40]; + c7_avx=c_bucket[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); + + res_avx1 = c_bucket[50]; //c_bucket0 + res_avx3 = c_bucket[51]; //c_bucket1 + res_avx5 = 
c_bucket[52]; + res_avx7 = c_bucket[53]; + + c22_avx=c_bucket[56]; + c23_avx=c_bucket[57]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + //loop4 + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + //loop5 + result_final2[0]=res_avx0; + result_final2[1]=res_avx1; + + result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final2[6]=res_avx6; + result_final2[7]=res_avx7; + + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + + //loop1 + res_avx0 = c_bucket[43]; + res_avx2 = c_bucket[44]; + res_avx4 = c_bucket[45]; + res_avx6 = c_bucket[46]; + + c6_avx=c_bucket[65]; + c7_avx=c_bucket[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); + + res_avx1 = c_bucket[59]; + res_avx3 = c_bucket[60]; + res_avx5 = c_bucket[61]; + res_avx7 = c_bucket[62]; + + 
c22_avx=c_bucket[81]; + c23_avx=c_bucket[82]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final3[0]=res_avx0; + result_final3[1]=res_avx1; + + result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final3[6]=res_avx6; + result_final3[7]=res_avx7; + + + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + + //loop1 + res_avx0 = c_bucket[68]; + res_avx2 = c_bucket[69]; + res_avx4 = c_bucket[70]; + res_avx6 = c_bucket[71]; + + c6_avx=c_bucket[74]; + c7_avx=c_bucket[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); + + res_avx1 = c_bucket[84]; + res_avx3 = c_bucket[85]; + res_avx5 = c_bucket[86]; + res_avx7 = c_bucket[87]; + + c22_avx=c_bucket[90]; + c23_avx=c_bucket[91]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final4[0]=res_avx0; + result_final4[1]=res_avx1; + + result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final4[6]=res_avx6; + result_final4[7]=res_avx7; + + + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + + //loop1 + res_avx0 = c_bucket[77]; + res_avx2 = c_bucket[78]; + res_avx4 = c_bucket[79]; + res_avx6 = c_bucket[96]; + + c6_avx=c_bucket[99]; + c7_avx=c_bucket[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx); + + res_avx1 = c_bucket[93]; + res_avx3 = c_bucket[94]; + res_avx5 = c_bucket[95]; + res_avx7 = c_bucket[112]; + + c22_avx=c_bucket[115]; + c23_avx=c_bucket[116]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final5[0]=res_avx0; + result_final5[1]=res_avx1; + + result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final5[6]=res_avx6; + result_final5[7]=res_avx7; + + + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + + //loop1 + res_avx0 = c_bucket[102]; + res_avx2 = c_bucket[103]; + res_avx4 = c_bucket[104]; + res_avx6 = c_bucket[105]; + + c6_avx=c_bucket[108]; + c7_avx=c_bucket[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); + + res_avx1 = c_bucket[118]; + res_avx3 = c_bucket[119]; + res_avx5 = c_bucket[120]; + res_avx7 = c_bucket[121]; + + c22_avx=c_bucket[124]; + c23_avx=c_bucket[125]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final6[0]=res_avx0; + result_final6[1]=res_avx1; + + result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final6[6]=res_avx6; + result_final6[7]=res_avx7; + + + //------------------------AVX interpolation for 7th poly ends-------------- + + //CLOCK2=cpucycles(); + //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); + //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); + + + +} + +static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ + + int i; + +//---------------AVX data----------------------------- + + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7*small_len_avx]; + +//----------------AVX data---------------------------- + + +// EVALUATION + + //CLOCK1=cpucycles(); + + for (i=0; i>= 63; + return 
(uint8_t) r; +} + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_LIGHTSABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/crypto_kem/lightsaber/avx2/verify.h b/crypto_kem/lightsaber/avx2/verify.h new file mode 100644 index 00000000..f57ee9bc --- /dev/null +++ b/crypto_kem/lightsaber/avx2/verify.h @@ -0,0 +1,22 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle +----------------------------------------------------*/ + +#include <stddef.h> +#include <stdint.h> + +/* returns 0 for equal strings, 1 for non-equal strings */ +uint8_t PQCLEAN_LIGHTSABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_LIGHTSABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + + +#endif diff --git a/crypto_kem/lightsaber/clean/LICENSE b/crypto_kem/lightsaber/clean/LICENSE index 08c799e3..d5d21fff 100644 --- a/crypto_kem/lightsaber/clean/LICENSE +++ b/crypto_kem/lightsaber/clean/LICENSE @@ -1,8 +1 @@ ----------------------------------------------------------------------------------------- -SABER_v1.1 - -Public domain - -Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, -Frederik Vercauteren ----------------------------------------------------------------------------------------- +Public Domain diff --git a/crypto_kem/lightsaber/clean/Makefile b/crypto_kem/lightsaber/clean/Makefile index b1b532e4..160435dc 100644 --- a/crypto_kem/lightsaber/clean/Makefile +++ b/crypto_kem/lightsaber/clean/Makefile @@ -1,10 +1,10 @@ # This Makefile can be used
with GNU Make or BSD Make LIB=liblightsaber_clean.a -HEADERS=api.h cbd.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h +HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c index 20cf1de2..ccb72492 100644 --- a/crypto_kem/lightsaber/clean/SABER_indcpa.c +++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c @@ -3,296 +3,90 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include "poly_mul.h" #include "randombytes.h" #include #include +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + int i, j; -#define h1 4 //2^(EQ-EP-1) + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(seed_s, SABER_NOISE_SEEDBYTES); -#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) 
+ PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, seed_s); + PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]); -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose); - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec); - -static void GenMatrix(polyvec *a, const unsigned char *seed) { - unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_LIGHTSABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); } } + + PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); + PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); } +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + int i, j; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned 
char *sk) { - polyvec a[SABER_K]; + PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_LIGHTSABER_CLEAN_GenSecret(sp, seed_sp); + PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); - uint16_t skpv[SABER_K][SABER_N]; - - unsigned char seed[SABER_SEEDBYTES]; - unsigned char noiseseed[SABER_COINBYTES]; - int32_t i, j; - uint16_t mod_q = SABER_Q - 1; - - - uint16_t res[SABER_K][SABER_N]; - - randombytes(seed, SABER_SEEDBYTES); - - // for not revealing system RNG state - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); - randombytes(noiseseed, SABER_COINBYTES); - - GenMatrix(a, seed); //sample matrix A - - // generate secret from constant-time binomial distribution - PQCLEAN_LIGHTSABER_CLEAN_GenSecret(skpv, noiseseed); - - // do the matrix vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { + for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1); - - // now rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - // shift right 3 bits - res[i][j] = (res[i][j] + h1) & (mod_q); - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP)); + bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); } } - // unload and pack sk=3 x (256 coefficients of 14 bits) - PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q); + PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, pk); + PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits) - // load the public-key coefficients - PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(pk, res, SABER_P); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(mp, m); - - // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + for (j = 0; j < SABER_N; j++) { + vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); } + PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); } +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) { - uint32_t i, j, k; - polyvec a[SABER_K]; - unsigned char seed[SABER_SEEDBYTES]; - // public key of received by the client - uint16_t pkcl[SABER_K][SABER_N]; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - uint16_t res[SABER_K][SABER_N]; - uint16_t mod_p = SABER_P - 1; - uint16_t mod_q = SABER_Q - 1; - uint16_t vprime[SABER_N]; - unsigned char msk_c[SABER_SCALEBYTES_KEM]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N]; + uint16_t v[SABER_N] = {0}; + uint16_t cm[SABER_N]; + int i; - // extract the seedbytes from Public Key. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; - } + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext); + PQCLEAN_LIGHTSABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); - GenMatrix(a, seed); - - // generate secret from constant-time binomial distribution - PQCLEAN_LIGHTSABER_CLEAN_GenSecret(skpv1, noiseseed); - - // matrix-vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0); - - // now rounding - //shift right 3 bits - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = ( res[i][j] + h1 ) & mod_q; - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) ); - } - } - - PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P); - - // ************client matrix-vector multiplication ends************ - - // now calculate the v' - // unpack the public_key - // pkcl is the b in the protocol - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P); for (i = 0; i < SABER_N; i++) { - vprime[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - skpv1[i][j] = skpv1[i][j] & (mod_p); - } + v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); } - // vector-vector scalar multiplication with mod p - InnerProd(pkcl, skpv1, mod_p, vprime); - - // addition of h1 to vprime - for (i = 0; i < SABER_N; i++) { - vprime[i] = vprime[i] + h1; - } - - // unpack message_received; - for (j = 0; j < SABER_KEYBYTES; j++) { - for (i = 0; i < 8; i++) { - message[8 * j + i] = ((message_received[j] >> i) & 0x01); - } - } - - // message encoding - for (i = 0; i < SABER_N; i++) { - message[i] = (message[i] << (SABER_EP - 1)); - } - - for (k = 0; k < SABER_N; k++) { - vprime[k] = ( (vprime[k] - 
message[k]) & (mod_p) ) >> (SABER_EP - SABER_ET); - } - - - PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(msk_c, vprime); - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j]; - } -} - - -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) { - uint32_t i, j; - // secret key of the server - uint16_t sksv[SABER_K][SABER_N]; - uint16_t pksv[SABER_K][SABER_N]; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t mod_p = SABER_P - 1; - uint16_t v[SABER_N]; - uint16_t op[SABER_N]; - - // sksv is the secret-key - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q); - // pksv is the ciphertext - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P); - - // vector-vector scalar multiplication with mod p - for (i = 0; i < SABER_N; i++) { - v[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - sksv[i][j] = sksv[i][j] & (mod_p); - } - } - InnerProd(pksv, sksv, mod_p, v); - - //Extraction - for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i]; - } - - PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(scale_ar, op); - - //addition of h1 - for (i = 0; i < SABER_N; i++) { - v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1); - } - - // pack decrypted message - POL2MSG(v, message_dec); -} -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) { - uint16_t acc[SABER_N]; - int32_t i, j, k; - - if (transpose == 1) { - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_LIGHTSABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - //reduction mod p - res[i][k] = (res[i][k] & mod); - //clear the accumulator - acc[k] = 0; - } - } - } - } else { - for (i 
= 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_LIGHTSABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N); - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - // reduction - res[i][k] = res[i][k] & mod; - // clear the accumulator - acc[k] = 0; - } - } - } - } -} - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) { - int32_t i, j; - - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i); - } - } -} - - -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) { - uint32_t j, k; - uint16_t acc[SABER_N]; - - // vector-vector scalar multiplication with mod p - for (j = 0; j < SABER_K; j++) { - PQCLEAN_LIGHTSABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[k] = res[k] + acc[k]; - // reduction - res[k] = res[k] & mod; - // clear the accumulator - acc[k] = 0; - } - } + PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(m, v); } diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.h b/crypto_kem/lightsaber/clean/SABER_indcpa.h index 4f806c55..efccbf5e 100644 --- a/crypto_kem/lightsaber/clean/SABER_indcpa.h +++ b/crypto_kem/lightsaber/clean/SABER_indcpa.h @@ -1,9 +1,13 @@ #ifndef INDCPA_H #define INDCPA_H +#include "SABER_params.h" +#include <stdint.h> + +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t
ciphertext[SABER_BYTES_CCA_DEC]); -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk); -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext); -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec); #endif - diff --git a/crypto_kem/lightsaber/clean/SABER_params.h b/crypto_kem/lightsaber/clean/SABER_params.h index eb3825f2..a6a9fc55 100644 --- a/crypto_kem/lightsaber/clean/SABER_params.h +++ b/crypto_kem/lightsaber/clean/SABER_params.h @@ -1,50 +1,39 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" -#define SABER_K 2 +/* Change this for different security strengths */ + +/* Don't change anything below this line */ +#define SABER_L 2 #define SABER_MU 10 #define SABER_ET 3 - #define SABER_EQ 13 #define SABER_EP 10 - #define SABER_N 256 -#define SABER_Q 8192 -#define SABER_P 1024 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_HASHBYTES 32 +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation - -#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) - -#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8) - -#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 
8) #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ - - - +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif - diff --git a/crypto_kem/lightsaber/clean/api.h b/crypto_kem/lightsaber/clean/api.h index 4f73c035..f0fe63f1 100644 --- a/crypto_kem/lightsaber/clean/api.h +++ b/crypto_kem/lightsaber/clean/api.h @@ -1,14 +1,18 @@ #ifndef PQCLEAN_LIGHTSABER_CLEAN_API_H #define PQCLEAN_LIGHTSABER_CLEAN_API_H + #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_ALGNAME "LightSaber" -#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_SECRETKEYBYTES 1568 -#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_PUBLICKEYBYTES (2*320+32) #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_BYTES 32 #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 736 +#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_PUBLICKEYBYTES 672 +#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_SECRETKEYBYTES 1568 int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); -int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); -int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + +int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + #endif /* api_h */ diff --git a/crypto_kem/lightsaber/clean/cbd.c 
b/crypto_kem/lightsaber/clean/cbd.c index f6ebe4d7..7e3f2be2 100644 --- a/crypto_kem/lightsaber/clean/cbd.c +++ b/crypto_kem/lightsaber/clean/cbd.c @@ -1,3 +1,7 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include <stdint.h> /*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "SABER_params.h" -#include "api.h" -#include "cbd.h" -#include <stdint.h> -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,10 +20,7 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint64_t t, d, a[4], b[4]; int i, j; @@ -34,8 +31,8 @@ void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) { d += (t >> j) & 0x0842108421UL; } - a[0] = d & 0x1f; - b[0] = (d >> 5) & 0x1f; + a[0] = d & 0x1f; + b[0] = (d >> 5) & 0x1f; a[1] = (d >> 10) & 0x1f; b[1] = (d >> 15) & 0x1f; a[2] = (d >> 20) & 0x1f; @@ -43,9 +40,9 @@ void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) { a[3] = (d >> 30) & 0x1f; b[3] = (d >> 35); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+ s[4 * i + 2] = (uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/lightsaber/clean/cbd.h b/crypto_kem/lightsaber/clean/cbd.h index 37553425..dffd4dc5 100644 --- a/crypto_kem/lightsaber/clean/cbd.h +++ b/crypto_kem/lightsaber/clean/cbd.h @@ -1,6 +1,5 @@ #ifndef CBD_H #define CBD_H - /*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ - -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); + #endif diff --git a/crypto_kem/lightsaber/clean/kem.c b/crypto_kem/lightsaber/clean/kem.c index 8aad4302..eb9353b1 100644 --- a/crypto_kem/lightsaber/clean/kem.c +++ b/crypto_kem/lightsaber/clean/kem.c @@ -1,5 +1,6 @@ #include "SABER_indcpa.h" #include "SABER_params.h" +#include "api.h" #include "fips202.h" #include "randombytes.h" #include "verify.h" @@ -7,90 +8,71 @@ #include #include -int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + +int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { int i; - // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk - PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk); - - // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { - sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; + 
sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk } - // Then hash(pk) is appended. - sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. - // Remaining part of sk contains a pseudo-random number. - // This is output when check in crypto_kem_dec() fails. - randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES ); + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec() fails. return (0); } -int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) { - // Will contain key, coins - unsigned char kr[64]; - unsigned char buf[64]; +int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; randombytes(buf, 32); - // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output - sha3_256(buf, buf, 32); + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output - // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM - sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); - - // kr[0:63] <-- Hash(buf[0:63]); - sha3_512(kr, buf, 64); + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - // buf[0:31] contains message; kr[32:63] contains randomness r; - 
PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, ct); + PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; - sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); - // hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } - -int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { +int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { int i; - unsigned char fail; - unsigned char cmp[SABER_BYTES_CCA_DEC]; - unsigned char buf[64]; - - // Will contain key, coins - unsigned char kr[64]; - const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES; - - // buf[0:31] <-- message - PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(sk, ct, buf); + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message // Multitarget countermeasure for coins + contributory KEM - // Save hash by storing h(pk) in sk - for (i = 0; i < 32; i++) { + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; } sha3_512(kr, buf, 64); - PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp); + PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk); + fail = PQCLEAN_LIGHTSABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC); - fail = PQCLEAN_LIGHTSABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC); - - // overwrite coins in kr with h(c) - sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) PQCLEAN_LIGHTSABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, 
fail); - // hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c index 4b1c409f..2a39a1d7 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.c +++ b/crypto_kem/lightsaber/clean/pack_unpack.c @@ -1,254 +1,140 @@ +#include "api.h" #include "pack_unpack.h" +#include -void PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - +void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 8; j++) { offset_byte = 3 * j; offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | - ((data[offset_data + 1] & 0x7) << 3) | - ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | - ((data[offset_data + 3] & 0x7) << 1) | - ((data[offset_data + 4] & 0x7) << 4) | - (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | - ((data[offset_data + 6] & 0x7) << 2) | - ((data[offset_data + 7] & 0x7) << 5); + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ((data[offset_data + 1] & 0x7) << 3) | ((data[offset_data + 2] & 0x3) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2) & 0x01) | ((data[offset_data + 3] & 0x7) << 1) | ((data[offset_data + 4] & 0x7) << 4) | (((data[offset_data + 5]) & 0x01) << 7); + bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1) & 0x03) | ((data[offset_data + 6] & 0x7) << 2) | ((data[offset_data + 7] & 0x7) << 5); } } -void PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t 
bytes[SABER_SCALEBYTES_KEM]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 8; j++) { offset_byte = 3 * j; offset_data = 8 * j; data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) | - (((bytes[offset_byte + 1]) & 0x01) << 2); - data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) | - (((bytes[offset_byte + 2]) & 0x03) << 1); + data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3) & 0x07; + data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6) & 0x03) | (((bytes[offset_byte + 1]) & 0x01) << 2); + data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1) & 0x07; + data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4) & 0x07; + data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7) & 0x01) | (((bytes[offset_byte + 2]) & 0x03) << 1); data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07); data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07); } } -void PQCLEAN_LIGHTSABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | - ((data[offset_data + 1] & 0x0f) << 4); - } -} - -void PQCLEAN_LIGHTSABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) { - uint32_t j; - uint32_t offset_data; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - ar[offset_data] = bytes[j] & 0x0f; - ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f; - } -} - -void PQCLEAN_LIGHTSABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = 
(data[offset_data + 0] & 0x3f) | - ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | - ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | - ((data[offset_data + 3] & 0x3f) << 2); - } -} - - -void PQCLEAN_LIGHTSABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | - ((bytes[offset_byte + 1] & 0x0f) << 2); - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | - ((bytes[offset_byte + 2] & 0x03) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); - } -} - - -static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) | - ((data[i][offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) | - ((data[i][offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) | - ((data[i][offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff); - } - } -} - -static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - 
offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x03) << 8); - data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | - ((bytes[offset_byte + 2] & 0x0f) << 6); - data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | - ((bytes[offset_byte + 3] & 0x3f) << 4); - data[i][offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | - ((bytes[offset_byte + 4] & 0xff) << 2); - } - } -} - - - -static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) | - ((data[i][offset_data + 1] & 0x07) << 5); - bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) | - ((data[i][offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) | - ((data[i][offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) | - ((data[i][offset_data + 4] & 0x0f) << 4); - bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) | - ((data[i][offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) | - ((data[i][offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) | - ((data[i][offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[i][offset_data 
+ 7] >> 5) & 0xff); - } - } -} - -static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - ((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | - ((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); - } - } -} - -//only BS2POLq no BS2POLp -void PQCLEAN_LIGHTSABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) { - uint32_t j; - uint32_t offset_data, offset_byte; - +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 8; j++) { offset_byte = 13 * j; offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - 
((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | - ((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); + bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); + bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); + bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); + bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); + bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); + bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); + bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); + bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); + bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); + bytes[offset_byte + 12] = ((data[offset_data + 7] >> 
5) & 0xff); } } -void PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - POLVECq2BS(bytes, data); +static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); } } -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - BS2POLVECp(bytes, data); - } else if (modulus == 8192) { - BS2POLVECq(bytes, data); +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + 
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); + bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + } +} + +static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); + data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); + data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); + data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + } +} + +void 
PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + } + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); + + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + } } } diff --git a/crypto_kem/lightsaber/clean/pack_unpack.h b/crypto_kem/lightsaber/clean/pack_unpack.h index 86fd2fad..44ccf31a 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.h +++ b/crypto_kem/lightsaber/clean/pack_unpack.h @@ -1,28 +1,27 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H - #include "SABER_params.h" #include #include +void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); -void PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data); - -void PQCLEAN_LIGHTSABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_LIGHTSABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar); - -void PQCLEAN_LIGHTSABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_LIGHTSABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data); +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); -void PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const 
uint16_t data[SABER_L][SABER_N]); + + +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); #endif diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c index fc86ab3c..1c1e22cc 100644 --- a/crypto_kem/lightsaber/clean/poly.c +++ b/crypto_kem/lightsaber/clean/poly.c @@ -1,21 +1,49 @@ -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ -#include "SABER_params.h" +#include "api.h" #include "cbd.h" #include "fips202.h" +#include "pack_unpack.h" #include "poly.h" +#include "poly_mul.h" +#include -void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) { - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); - - for (size_t i = 0; i < SABER_K; i++) { - PQCLEAN_LIGHTSABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); +void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { + int i, j; + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_L; j++) { + if (transpose == 1) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + } else { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + } + } + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { + int j; + for (j = 0; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(b[j], s[j], res); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + int i; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + size_t i; + + shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); } } diff --git 
a/crypto_kem/lightsaber/clean/poly.h b/crypto_kem/lightsaber/clean/poly.h index 47ceeebb..1f50c48e 100644 --- a/crypto_kem/lightsaber/clean/poly.h +++ b/crypto_kem/lightsaber/clean/poly.h @@ -1,26 +1,15 @@ #ifndef POLY_H #define POLY_H - -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ - - #include "SABER_params.h" #include -typedef struct { - uint16_t coeffs[SABER_N]; -} poly; +void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); -typedef struct { - poly vec[SABER_K]; -} polyvec; +void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); + +void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); -void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed); #endif diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c index 926910b5..5e37a024 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.c +++ b/crypto_kem/lightsaber/clean/poly_mul.c @@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } } -void PQCLEAN_LIGHTSABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) { - uint32_t i; - // normal multiplication - uint16_t c[512]; - - for (i = 
0; i < 512; i++) { - c[i] = 0; - } +/* res += a*b */ +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { + uint16_t c[2 * SABER_N] = {0}; + int i; toom_cook_4way(a, b, c); - // reduction - for (i = n; i < 2 * n; i++) { - res[i - n] = (c[i - n] - c[i]) & (p - 1); + /* reduction */ + for (i = SABER_N; i < 2 * SABER_N; i++) { + res[i - SABER_N] += (c[i - SABER_N] - c[i]); } } diff --git a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h index 8d634584..0d5cf6ed 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.h +++ b/crypto_kem/lightsaber/clean/poly_mul.h @@ -1,9 +1,9 @@ -#ifndef POLYMUL_H -#define POLYMUL_H - +#ifndef POLY_MUL_H +#define POLY_MUL_H #include "SABER_params.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n); +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); + #endif diff --git a/crypto_kem/lightsaber/clean/verify.c b/crypto_kem/lightsaber/clean/verify.c index 52c6969b..05e564da 100644 --- a/crypto_kem/lightsaber/clean/verify.c +++ b/crypto_kem/lightsaber/clean/verify.c @@ -1,3 +1,5 @@ +#include "verify.h" + /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -5,26 +7,25 @@ This file has been adapted from the implementation by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------*/ -#include "verify.h" -#include + /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_LIGHTSABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) { +uint8_t PQCLEAN_LIGHTSABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { uint64_t r; size_t i; - r = 0; + for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } r = (~r + 1); // Two's complement r >>= 63; - return (unsigned char)r; + return (uint8_t) r; } /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_LIGHTSABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { +void PQCLEAN_LIGHTSABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { size_t i; b = -b; diff --git a/crypto_kem/lightsaber/clean/verify.h b/crypto_kem/lightsaber/clean/verify.h index 32c2adb5..4f538e6f 100644 --- a/crypto_kem/lightsaber/clean/verify.h +++ b/crypto_kem/lightsaber/clean/verify.h @@ -1,6 +1,5 @@ #ifndef VERIFY_H #define VERIFY_H - /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle #include /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_LIGHTSABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len); +uint8_t PQCLEAN_LIGHTSABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_LIGHTSABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); +void PQCLEAN_LIGHTSABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + #endif diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 4de4f1c8..50250180 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,4 +14,13 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + - name: avx2 + version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: + - avx2 diff --git a/crypto_kem/saber/avx2/LICENSE b/crypto_kem/saber/avx2/LICENSE new file mode 100644 index 00000000..d5d21fff --- /dev/null +++ b/crypto_kem/saber/avx2/LICENSE @@ -0,0 +1 @@ +Public Domain diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile new file mode 100644 index 00000000..65cc21ef --- /dev/null +++ b/crypto_kem/saber/avx2/Makefile @@ -0,0 +1,22 @@ +# This Makefile can be used with GNU Make or BSD Make + +LIB=libsaber_avx2.a +HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o + +CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) 
+ +all: $(LIB) + +%.o: %.s $(HEADERS) + $(AS) -o $@ $< + +%.o: %.c $(HEADERS) + $(CC) $(CFLAGS) -c -o $@ $< + +$(LIB): $(OBJECTS) + $(AR) -r $@ $(OBJECTS) + +clean: + $(RM) $(OBJECTS) + $(RM) $(LIB) diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c new file mode 100644 index 00000000..d16a7a06 --- /dev/null +++ b/crypto_kem/saber/avx2/SABER_indcpa.c @@ -0,0 +1,416 @@ +#include "./polymul/toom-cook_4way.c" +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "randombytes.h" +#include +#include +#include +//#include "randombytes.h" +//#include "./polymul/toom_cook_4/toom-cook_4way.c" + +#define h1 4 //2^(EQ-EP-1) + +#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) + + +static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { + int32_t i, j; + + for (j = 0; j < SABER_KEYBYTES; j++) { + message_dec[j] = 0; + for (i = 0; i < 8; i++) { + message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); + } + } +} + +/*----------------------------------------------------------------------------------- + This routine generates a=[Matrix K x K] of 256-coefficient polynomials +-------------------------------------------------------------------------------------*/ + +static void GenMatrix(polyvec *a, const uint8_t *seed) { + uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; + + uint16_t temp_ar[SABER_N]; + + int i, j, k; + uint16_t mod = (SABER_Q - 1); + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + PQCLEAN_SABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); + for (k = 0; k < SABER_N; k++) { + a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; + } + } + } +} + +static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { + + uint32_t i; + + uint8_t buf[SABER_MU * 
SABER_N * SABER_K / 8]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_K; i++) { + PQCLEAN_SABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); + } +} + +//********************************matrix-vector mul routines***************************************************** +static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { + int64_t i, j; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + for (j = 0; j < NUM_POLY; j++) { + + if (isTranspose == 0) { + toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); + } else { + toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); + } + } + + TC_interpol(c_bucket, res_avx[i]); + } + +} + +static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { + + int64_t i; + + __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time + + for (i = 0; i < NUM_POLY; i++) { + toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); + } + TC_interpol(c_bucket, res_avx); +} + +//********************************matrix-vector mul routines***************************************************** + +void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { + + polyvec a[SABER_K]; + + uint16_t skpv1[SABER_K][SABER_N]; + + + + uint8_t seed[SABER_SEEDBYTES]; + uint8_t noiseseed[SABER_COINBYTES]; + int32_t i, j, k; + + +//--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + +//--------------AVX declaration ends------------------ + 
+ randombytes(seed, SABER_SEEDBYTES); + + shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(noiseseed, SABER_COINBYTES); + + + GenMatrix(a, seed); //sample matrix A + + GenSecret(skpv1, noiseseed); + + +// Load sk into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + + } + + // Load a into avx vectors + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + + + + //------------------------do the matrix vector multiplication and rounding------------ + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + + // Now truncation + + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); + } + } + + //------------------Pack sk into byte string------- + + PQCLEAN_SABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); + + //------------------Pack pk into byte string------- + + for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + PQCLEAN_SABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string + + + for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. 
+ pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + } + +} + + +void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + + + uint32_t i, j, k; + polyvec a[SABER_K]; // skpv; + uint8_t seed[SABER_SEEDBYTES]; + uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + + + uint16_t skpv1[SABER_K][SABER_N]; + uint16_t temp[SABER_K][SABER_N]; + uint16_t message[SABER_KEYBYTES * 8]; + + uint8_t msk_c[SABER_SCALEBYTES_KEM]; + + //--------------AVX declaration------------------ + + __m256i sk_avx[SABER_K][SABER_N / 16]; + __m256i mod, mod_p; + __m256i res_avx[SABER_K][SABER_N / 16]; + __m256i vprime_avx[SABER_N / 16]; + __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; + //__m256i acc[2*SABER_N/16]; + + __m256i pkcl_avx[SABER_K][SABER_N / 16]; + + __m256i message_avx[SABER_N / 16]; + + mod = _mm256_set1_epi16(SABER_Q - 1); + mod_p = _mm256_set1_epi16(SABER_P - 1); + + + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + + //--------------AVX declaration ends------------------ + for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. 
+ seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + } + + GenMatrix(a, seed); + GenSecret(skpv1, noiseseed); + + // ----------- Load skpv1 into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + } + } + + // ----------- Load matrix a into avx vectors ---------- + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_K; j++) { + for (k = 0; k < SABER_N / 16; k++) { + a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); + } + } + } + //-----------------matrix-vector multiplication and rounding + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sk_avx[j], b_bucket[j]); + } + matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + + // Now truncation: add the rounding constant h1, then drop the low EQ-EP bits + + for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N / 16; j++) { + res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); + res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); + res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); // NOTE(review): mask is SABER_Q - 1 even though the value was already shifted down; confirm mod_p was not intended + + } + } + + + //-----this result should be put in b_prime for later use in server. 
+ for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); + } + } + + PQCLEAN_SABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string + +//**************client matrix-vector multiplication ends******************// + + //------now calculate the v' + + //-------unpack the public_key + PQCLEAN_SABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); + } + } + + // InnerProduct + //for(k=0;k> i) & 0x01); + } + } + // message encoding + for (i = 0; i < SABER_N / 16; i++) { + message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); + message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); + } + + // SHIFTRIGHT(v'+h1-m mod p, EP-ET) + for (k = 0; k < SABER_N / 16; k++) { + vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); + vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); + vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); + } + + // Unpack avx + for (j = 0; j < SABER_N / 16; j++) { + _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); + } + + PQCLEAN_SABER_AVX2_SABER_pack_4bit(msk_c, temp[0]); + + + for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { + ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + } + +} + + +void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + + uint32_t i, j; + uint16_t sksv[SABER_K][SABER_N]; //secret key of the server + uint16_t pksv[SABER_K][SABER_N]; + uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; + uint8_t scale_ar[SABER_SCALEBYTES_KEM]; + 
uint16_t op[SABER_N]; + + //--------------AVX declaration------------------ + + + //__m256i mod_p; + + __m256i v_avx[SABER_N / 16]; + + //__m256i acc[2*SABER_N/16]; + + __m256i sksv_avx[SABER_K][SABER_N / 16]; + __m256i pksv_avx[SABER_K][SABER_N / 16]; + + //mod_p=_mm256_set1_epi16(SABER_P-1); + + __m256i b_bucket[NUM_POLY][SCHB_N * 4]; + //--------------AVX declaration ends------------------ + + //-------unpack the secret key and the ciphertext + + PQCLEAN_SABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key + PQCLEAN_SABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext + + for (i = 0; i < SABER_K; i++) { + for (j = 0; j < SABER_N / 16; j++) { + sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); + pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); + } + } + + for (i = 0; i < SABER_N / 16; i++) { + v_avx[i] = _mm256_setzero_si256(); // clear v_avx without reading its indeterminate initial contents (was a self-XOR of an uninitialised vector) + } + + + // InnerProduct(b', s, mod p) + + for (j = 0; j < NUM_POLY; j++) { + TC_eval(sksv_avx[j], b_bucket[j]); + } + + vector_vector_mul(pksv_avx, b_bucket, v_avx); + + for (i = 0; i < SABER_N / 16; i++) { + _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); + } + + + for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { + scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; + } + + PQCLEAN_SABER_AVX2_SABER_un_pack4bit(op, scale_ar); + + + //addition of h2 + for (i = 0; i < SABER_N; i++) { + message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + } + + + POL2MSG(m, message_dec_unpacked); +} diff --git a/crypto_kem/saber/avx2/SABER_indcpa.h b/crypto_kem/saber/avx2/SABER_indcpa.h new file mode 100644 index 00000000..acdda606 --- /dev/null +++ b/crypto_kem/saber/avx2/SABER_indcpa.h @@ -0,0 +1,13 @@ +#ifndef INDCPA_H +#define INDCPA_H +#include "SABER_params.h" +#include + +void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t
pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); + + +#endif diff --git a/crypto_kem/saber/avx2/SABER_params.h b/crypto_kem/saber/avx2/SABER_params.h new file mode 100644 index 00000000..9b0edafe --- /dev/null +++ b/crypto_kem/saber/avx2/SABER_params.h @@ -0,0 +1,46 @@ +#ifndef PARAMS_H +#define PARAMS_H +#include "api.h" + + + + +#define SABER_K 3 +#define SABER_MU 8 +#define SABER_ET 4 + + +#define SABER_EQ 13 +#define SABER_EP 10 + +#define SABER_N 256 +#define SABER_Q 8192 //2^13 +#define SABER_P 1024 + +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_COINBYTES 32 +#define SABER_KEYBYTES 32 + +#define SABER_HASHBYTES 32 + +#define SABER_POLYBYTES 416 //13*256/8 + +#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) + +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation + +#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) + +#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) + +#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) +#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) + +#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) + +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) + +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ + +#endif diff --git a/crypto_kem/saber/avx2/api.h b/crypto_kem/saber/avx2/api.h new file mode 100644 index 00000000..20bf0df3 --- /dev/null 
+++ b/crypto_kem/saber/avx2/api.h @@ -0,0 +1,18 @@ +#ifndef PQCLEAN_SABER_AVX2_API_H +#define PQCLEAN_SABER_AVX2_API_H + + +#define PQCLEAN_SABER_AVX2_CRYPTO_ALGNAME "Saber" +#define PQCLEAN_SABER_AVX2_CRYPTO_BYTES 32 +#define PQCLEAN_SABER_AVX2_CRYPTO_CIPHERTEXTBYTES 1088 +#define PQCLEAN_SABER_AVX2_CRYPTO_PUBLICKEYBYTES 992 +#define PQCLEAN_SABER_AVX2_CRYPTO_SECRETKEYBYTES 2304 + +int PQCLEAN_SABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_SABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_SABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + + +#endif /* PQCLEAN_SABER_AVX2_API_H */ diff --git a/crypto_kem/saber/avx2/cbd.c b/crypto_kem/saber/avx2/cbd.c new file mode 100644 index 00000000..7639d7d2 --- /dev/null +++ b/crypto_kem/saber/avx2/cbd.c @@ -0,0 +1,51 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ + + +static uint64_t load_littleendian(const unsigned char *x, int bytes) { + int i; + uint64_t r = x[0]; + for (i = 1; i < bytes; i++) { + r |= (uint64_t)x[i] << (8 * i); + } + return r; +} + + +void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { + uint16_t Qmod_minus1 = SABER_Q - 1; + + uint32_t t, d, a[4], b[4]; + int i, j; + + for (i = 0; i < SABER_N / 4; i++) { + t = load_littleendian(buf + 4 * i, 4); + d = 0; + for (j = 0; j < 4; j++) { + d += (t >> j) & 0x11111111; + } + + a[0] = d & 0xf; + b[0] = (d >> 4) & 0xf; + a[1] = (d >> 8) & 0xf; + b[1] = (d >> 12) & 0xf; + a[2] = (d >> 16) & 0xf; + b[2] = (d >> 20) & 0xf; + a[3] = (d >> 24) & 0xf; + b[3] = (d >> 28); + + r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; + r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; + r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; + r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + } +} diff --git a/crypto_kem/saber/avx2/cbd.h b/crypto_kem/saber/avx2/cbd.h new file mode 100644 index 00000000..e80ffc75 --- /dev/null +++ b/crypto_kem/saber/avx2/cbd.h @@ -0,0 +1,16 @@ +#ifndef CBD_H +#define CBD_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ +#include "poly.h" +#include + +void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); + + +#endif diff --git a/crypto_kem/saber/avx2/kem.c b/crypto_kem/saber/avx2/kem.c new file mode 100644 index 00000000..c88bb315 --- /dev/null +++ b/crypto_kem/saber/avx2/kem.c @@ -0,0 +1,79 @@ +#include "SABER_indcpa.h" +#include "SABER_params.h" +#include "api.h" +#include "fips202.h" +#include "randombytes.h" +#include "verify.h" +#include +#include +#include +#include + + +int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { + int i; + + PQCLEAN_SABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk + for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk + } + + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is written at offset SABER_SECRETKEYBYTES-64. + + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // The last SABER_KEYBYTES bytes of sk are filled from randombytes(). + // This is output when check in PQCLEAN_SABER_AVX2_crypto_kem_dec() fails. 
+ return (0); +} + +int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; + + randombytes(buf, 32); + + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output + + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); + // K^ <-- kr[0:31] + // noiseseed (r) <-- kr[32:63]; + PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} + +int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { + int i; + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + + PQCLEAN_SABER_AVX2_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message + + // Multitarget countermeasure for coins + contributory KEM + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk + buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; + } + + sha3_512(kr, buf, 64); + + PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + + fail = PQCLEAN_SABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); + + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) + + PQCLEAN_SABER_AVX2_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); + + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k + + return (0); +} diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h new file mode 100644 index 00000000..612ff4ff --- /dev/null +++ 
b/crypto_kem/saber/avx2/kem.h @@ -0,0 +1,35 @@ +#ifndef INDCPA_H +#define INDCPA_H + +#include + +void PQCLEAN_SABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); + + +void PQCLEAN_SABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_SABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); + + +void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); + +void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); + +void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); + + +int PQCLEAN_SABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +int PQCLEAN_SABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_SABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); + + + +//uint64_t clock1,clock2; + +//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; + + +#endif diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c new file mode 100644 index 00000000..00bf9c08 --- /dev/null +++ b/crypto_kem/saber/avx2/pack_unpack.c @@ -0,0 +1,502 @@ +#include "pack_unpack.h" + + +void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); + bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); 
+ } +} + +void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 3 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; + data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; + data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); + data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; + data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; + data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); + data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); + data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); + } + +} + +void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); + } +} + +void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0; + + for (j = 0; j < SABER_N / 2; j++) { + offset_data = 2 * j; + data[offset_data] = bytes[j] & 0x0f; + data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; + } +} + +void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); + bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + } +} + 
+ +void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 3 * j; + offset_data = 4 * j; + data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; + data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; + data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; + data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); + } + +} + +void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); + + 
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + } + } +} + +void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); + + bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void 
PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } +} + + + +void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); + data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 
3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } +} + +void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + + + +void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 10) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 5 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 
] & 0x03) << 8); + data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); + data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); + data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); + + } + } + + +} + + +void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); + + bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); + + bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); + + bytes[offset_byte + 12] = ( 
(data[i][ offset_data + 7 ] >> 5) & 0xff ); + + } + } + + +} + +void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 13) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 13 * j; + offset_data = 8 * j; + data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + } + + +} + +void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { + + uint32_t j; + uint32_t offset_data = 0, offset_byte = 0; + + //for(i=0;i> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | 
((bytes[offset_byte + 6] & 0x0f) << 9); + data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + } + //} + + +} + + +void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + /*This function packs 11 bit data stream into 8 bits of data. + */ + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); + + bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); + + bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); + + bytes[offset_byte + 9] = ( (data[i][ 
offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); + + bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); + + } + } + +} + +void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 11) / 8; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = offset_byte1 + 11 * j; + offset_data = 8 * j; + + data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); + + data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); + + data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); + + data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); + + data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); + } + } + + +} + +void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); + + bytes[offset_byte + 1] = ( 
(data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); + + bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); + + bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); + + bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); + + bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); + + bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); + } + } + + +} + + +void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { + + uint32_t i, j; + uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; + + for (i = 0; i < SABER_K; i++) { + offset_byte1 = i * (SABER_N * 14) / 8; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = offset_byte1 + 7 * j; + offset_data = 4 * j; + data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); + + data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); + + data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); + + data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); + } + } + + +} + +void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { + + if (modulus == 1024) { + PQCLEAN_SABER_AVX2_POLVECp2BS(bytes, data); + } else if (modulus == 8192) { + PQCLEAN_SABER_AVX2_POLVECq2BS(bytes, data); + } +} + +void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) { + + if (modulus == 1024) { + PQCLEAN_SABER_AVX2_BS2POLVECp(data, bytes); + } else if (modulus == 8192) { + PQCLEAN_SABER_AVX2_BS2POLVECq(data, 
bytes); + } + +} diff --git a/crypto_kem/saber/avx2/pack_unpack.h b/crypto_kem/saber/avx2/pack_unpack.h new file mode 100644 index 00000000..e1608d4c --- /dev/null +++ b/crypto_kem/saber/avx2/pack_unpack.h @@ -0,0 +1,56 @@ +#ifndef PACK_UNPACK_H +#define PACK_UNPACK_H +#include "SABER_params.h" +#include +#include + +void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); + +void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); + +void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); + +void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + +void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); + + +void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); + + +void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); + 
+void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + +void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); + + +#endif diff --git a/crypto_kem/saber/avx2/poly.h b/crypto_kem/saber/avx2/poly.h new file mode 100644 index 00000000..2978d0d8 --- /dev/null +++ b/crypto_kem/saber/avx2/poly.h @@ -0,0 +1,27 @@ +#ifndef POLY_H +#define POLY_H +/*--------------------------------------------------------------------- +This file has been adapted from the implementation +(available at, Public Domain https://github.com/pq-crystals/kyber) +of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" +by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------------------------*/ +#include "SABER_params.h" +#include + +typedef struct { + uint16_t coeffs[SABER_N]; +} poly; + +typedef struct { + poly vec[SABER_K]; +} polyvec; + +void PQCLEAN_SABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); + + +void PQCLEAN_SABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); + + +#endif diff --git a/crypto_kem/saber/avx2/polymul/consts.h b/crypto_kem/saber/avx2/polymul/consts.h new file mode 100644 index 00000000..40826398 --- /dev/null +++ b/crypto_kem/saber/avx2/polymul/consts.h @@ -0,0 +1,20 @@ +#include "../SABER_params.h" + +#define AVX_N (SABER_N >> 4) +#define small_len_avx (AVX_N >> 2) + +#define SCHB_N 16 + +#define N_SB (SABER_N >> 2) +#define N_SB_RES (2*N_SB-1) + +#define N_SB_16 (N_SB >> 2) +#define 
N_SB_16_RES (2*N_SB_16-1) + +#define AVX_N1 16 /*N/16*/ + +#define SCM_SIZE 16 + +// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements +#define NUM_POLY SABER_K +//int NUM_POLY=2; diff --git a/crypto_kem/saber/avx2/polymul/matrix.c b/crypto_kem/saber/avx2/polymul/matrix.c new file mode 100644 index 00000000..5fa35783 --- /dev/null +++ b/crypto_kem/saber/avx2/polymul/matrix.c @@ -0,0 +1,303 @@ +#include + +static void transpose_n1(__m256i *M) +{ + //int i; + register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + register __m256i temp, temp0, temp1, temp2; + + //for(i=0; i<8; i=i+1) + //{ + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + + + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = 
_mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + + + r0 = _mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + + //for(i=0; i<4; i=i+1) + //{ + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = _mm256_unpacklo_epi32(r6, r7); + + //} + + + //for(i=0; i<2; i=i+1) + //{ + r8 = _mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + //} +// for(i=0; i<2; i=i+1) +// { + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + +// } + + //------------------------------------------------------- + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +/* +void transpose_unrolled(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + __m256i r0, r1, r2, r3, r4, r5, r6, r7; + + //for(i=0; i<8; i=i+1) + //{ + tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); + 
tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); + + tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); + tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); + + tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); + tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); + + tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); + tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); + + tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); + tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); + + tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); + tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); + + tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); + tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); + + tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); + tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); + + //} + + //------------------------------------------------------- + //for(i=0; i<4; i=i+1) + //{ + bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); + bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); + + bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); + bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); + + bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); + bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); + + bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); + bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); + + //} + + //for(i=0; i<2; i=i+1) + //{ + dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); + dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); + + dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); + dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); + + M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); + M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); + M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); + M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); + + //} + //for(i=0; i<2; i=i+1) + //{ + eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); + eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); + + eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); + eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); + + //} + + //------------------------------------------------------- + + //------------------------------------------------------- 
+ for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + + for(i=0; i<2; i=i+1) + { + fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); + fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); + gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); + } + + //------------------------------------------------------- + + + + M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); + M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); + M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); + M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); + + M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); + M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); + M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); + M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); + + M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); + M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); + M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); + M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); +} + + +void transpose1(__m256i *M) +{ + int i; + __m256i tL[8], tH[8]; + __m256i bL[4], bH[4], cL[4], cH[4]; + __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; + + for(i=0; i<8; i=i+1) + { + tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); + tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); + } + + for(i=0; i<4; i=i+1) + { + bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); + bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); + } + for(i=0; i<4; i=i+1) + { + cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); + cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); + dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); + eH[i] = 
_mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); + } + + for(i=0; i<2; i=i+1) + { + fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); + fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); + } + for(i=0; i<2; i=i+1) + { + gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); + gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); + } + + M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); + M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); + M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); + M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); + + M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); + M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); + M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); + M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); + + M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); + M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); + M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); + M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); + + M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); + M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); + M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); + M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); +} +*/ diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c new file mode 100644 index 00000000..4e4f11f8 --- /dev/null +++ b/crypto_kem/saber/avx2/polymul/scm_avx.c @@ -0,0 +1,753 @@ +//#define SCM_SIZE 16 + +//#pragma STDC FP_CONTRACT ON + +#include + +inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { + return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); +} + + +static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched + //the c_avx are added cummulatively +{ + + register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + register __m256i temp; + + + a0=a[0]; + a1=a[1]; + a2=a[2]; + a3=a[3]; + a4=a[4]; + a5=a[5]; 
+ a6=a[6]; + a7=a[7]; + + b0=b[0]; + b1=b[1]; + b2=b[2]; + b3=b[3]; + b4=b[4]; + b5=b[5]; + b6=b[6]; + b7=b[7]; + + // New Unrolled first triangle + + //otherwise accumulate + c_avx[0] = mul_add(a0, b0, c_avx[0]); + + + temp = _mm256_mullo_epi16 (a0, b1); + temp=mul_add(a1, b0, temp); + c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); + + + temp = _mm256_mullo_epi16 (a0, b2); + temp = mul_add(a1, b1, temp); + temp=mul_add(a2, b0, temp); + c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); + + + temp = _mm256_mullo_epi16 (a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + temp=mul_add(a3, b0, temp); + c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); + + temp = _mm256_mullo_epi16 (a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + temp=mul_add(a2, b2, temp); + c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); + + + temp = _mm256_mullo_epi16 (a0, b5); + temp = mul_add(a1, b4 , temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + temp=mul_add(a5, b0, temp); + c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); + + temp = _mm256_mullo_epi16 (a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + temp=mul_add(a4, b2, temp); + c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); + + + temp = _mm256_mullo_epi16 (a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add (a6, b1, temp); + temp = mul_add (a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add (a3, b4, temp); + temp = mul_add (a4, b3, temp); + temp=mul_add(a5, b2, temp); + c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); + + temp = _mm256_mullo_epi16 (a0, b[8]); + temp = mul_add (a1, b7, temp); + temp = mul_add (a7, b1, temp); + temp = mul_add (a[8], b0, temp); + temp = mul_add (a2, b6,temp); + temp = mul_add(a3, b5, temp); + temp = mul_add (a4, b4,temp); + temp = mul_add (a5, b3, temp); + + 
temp=mul_add(a6, b2, temp); + c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); + + + temp = _mm256_mullo_epi16 (a0, b[9]); + temp = mul_add (a1, b[8], temp); + temp = mul_add (a[8], b1, temp); + temp = mul_add (a[9], b0, temp); + temp = mul_add (a2, b7, temp); + temp = mul_add (a3, b6, temp); + temp = mul_add (a4, b5, temp); + temp = mul_add (a5, b4, temp); + temp = mul_add (a6, b3, temp); + temp=mul_add(a7, b2, temp); + c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); + + + temp= _mm256_mullo_epi16 (a0, b[10]); + temp = mul_add (a1, b[9], temp); + temp = mul_add (a[9], b1, temp); + temp = mul_add (a[10], b0, temp); + temp = mul_add (a2, b[8], temp); + temp = mul_add (a3, b7, temp); + temp = mul_add (a4, b6, temp); + temp = mul_add (a5, b5, temp); + temp = mul_add (a6, b4, temp); + temp = mul_add (a7, b3, temp); + temp=mul_add(a[8], b2, temp); + c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); + + + temp = _mm256_mullo_epi16 (a0, b[11]); + temp = mul_add (a1, b[10], temp ); + temp = mul_add (a[10], b1, temp ); + temp = mul_add (a[11], b0, temp ); + temp = mul_add (a2, b[9], temp ); + temp = mul_add (a3, b[8], temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a[8], b3, temp ); + temp=mul_add(a[9], b2, temp); + c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); + + + temp = _mm256_mullo_epi16 (a0, b[12]); + temp = mul_add (a1, b[11], temp); + temp = mul_add (a[11], b1, temp); + temp = mul_add (a[12], b0, temp); + temp = mul_add (a2, b[10], temp); + temp = mul_add (a3, b[9], temp); + temp = mul_add (a4, b[8], temp); + temp = mul_add (a5, b7, temp); + temp = mul_add (a6, b6, temp); + temp = mul_add (a7, b5, temp); + temp = mul_add (a[8], b4, temp); + temp = mul_add (a[9], b3, temp); + temp=mul_add(a[10], b2, temp); + c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); + + + temp = _mm256_mullo_epi16 (a0, b[13]); + temp = mul_add (a1, b[12], temp ); + temp = mul_add (a[12], b1, 
temp ); + temp = mul_add (a[13], b0, temp ); + temp = mul_add (a2, b[11], temp ); + temp = mul_add (a3, b[10], temp ); + temp = mul_add (a4, b[9], temp ); + temp = mul_add (a5, b[8], temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a[8], b5, temp ); + temp = mul_add (a[9], b4, temp ); + temp = mul_add (a[10], b3, temp ); + temp=mul_add(a[11], b2, temp); + c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); + + + + temp = _mm256_mullo_epi16 (a0, b[14]); + temp = mul_add (a1, b[13], temp ); + temp = mul_add (a[13], b1, temp ); + temp = mul_add (a[14], b0, temp ); + temp = mul_add (a2, b[12], temp ); + temp = mul_add (a3, b[11], temp ); + temp = mul_add (a4, b[10], temp ); + temp = mul_add (a5, b[9], temp ); + temp = mul_add (a6, b[8], temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a[8], b6, temp ); + temp = mul_add (a[9], b5, temp ); + temp = mul_add (a[10], b4, temp ); + temp = mul_add (a[11], b3, temp ); + temp=mul_add(a[12], b2, temp); + c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); + + + temp = _mm256_mullo_epi16 (a0, b[15]); + temp = mul_add (a1, b[14], temp ); + temp = mul_add (a[14], b1, temp ); + temp = mul_add (a[15], b0, temp ); + temp = mul_add (a2, b[13], temp ); + temp = mul_add (a3, b[12], temp ); + temp = mul_add (a4, b[11], temp ); + temp = mul_add (a5, b[10], temp ); + temp = mul_add (a6, b[9], temp ); + temp = mul_add (a7, b[8], temp ); + temp = mul_add (a[8], b7, temp ); + temp = mul_add (a[9], b6, temp ); + temp = mul_add (a[10], b5, temp ); + temp = mul_add (a[11], b4, temp ); + temp = mul_add (a[12], b3, temp ); + temp=mul_add(a[13], b2, temp); + c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); + + + // unrolled second triangle + a0=a[14]; + a1=a[15]; + a2=a[13]; + a3=a[12]; + a4=a[11]; + a5=a[10]; + a6=a[9]; + a7=a[8]; + + b0=b[14]; + b1=b[15]; + b2=b[13]; + b3=b[12]; + b4=b[11]; + b5=b[10]; + b6=b[9]; + b7=b[8]; + + temp = _mm256_mullo_epi16 (a[1], b1); + temp = mul_add (a[2], b0, temp ); + 
temp = mul_add (a[3], b2, temp ); + temp = mul_add (a[4], b3, temp ); + temp = mul_add (a[5], b4, temp ); + temp = mul_add (a[6], b5, temp ); + temp = mul_add (a[7], b6, temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a6, b[7], temp ); + temp = mul_add (a5, b[6], temp ); + temp = mul_add (a4, b[5], temp ); + temp = mul_add (a3, b[4], temp ); + temp = mul_add (a2, b[3], temp ); + temp = mul_add (a0, b[2], temp ); + temp=mul_add(a1, b[1], temp); + c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); + + + temp = _mm256_mullo_epi16 (a[2], b1); + temp = mul_add (a[3], b0, temp ); + temp = mul_add (a[4], b2, temp ); + temp = mul_add (a[5], b3, temp ); + temp = mul_add (a[6], b4, temp ); + temp = mul_add (a[7], b5, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a5, b[7], temp ); + temp = mul_add (a4, b[6], temp ); + temp = mul_add (a3, b[5], temp ); + temp = mul_add (a2, b[4], temp ); + temp = mul_add (a0, b[3], temp ); + temp=mul_add(a1, b[2], temp); + c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); + + + temp = _mm256_mullo_epi16 (a[3], b1); + temp = mul_add (a[4], b0, temp ); + temp = mul_add (a[5], b2, temp ); + temp = mul_add (a[6], b3, temp ); + temp = mul_add (a[7], b4, temp ); + temp = mul_add (a7, b5, temp ); + temp = mul_add (a6, b6, temp ); + temp = mul_add (a5, b7, temp ); + temp = mul_add (a4, b[7], temp ); + temp = mul_add (a3, b[6], temp ); + temp = mul_add (a2, b[5], temp ); + temp = mul_add (a0, b[4], temp ); + temp=mul_add(a1, b[3], temp); + c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); + + + temp = _mm256_mullo_epi16 (a[4], b1); + temp = mul_add (a[5], b0, temp ); + temp = mul_add (a[6], b2, temp ); + temp = mul_add (a[7], b3, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a3, b[7], temp ); + temp = mul_add (a2, b[6], temp ); + temp = mul_add (a0, b[5], temp ); + temp=mul_add(a1, b[4], 
temp); + c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); + + + temp = _mm256_mullo_epi16 (a[5], b1); + temp = mul_add (a[6], b0, temp ); + temp = mul_add (a[7], b2, temp ); + temp = mul_add (a7, b3, temp ); + temp = mul_add (a6, b4, temp ); + temp = mul_add (a5, b5, temp ); + temp = mul_add (a4, b6, temp ); + temp = mul_add (a3, b7, temp ); + temp = mul_add (a2, b[7], temp ); + temp = mul_add (a0, b[6], temp ); + temp=mul_add(a1, b[5], temp); + c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); + + + temp = _mm256_mullo_epi16 (a[6], b1); + temp = mul_add (a[7], b0, temp ); + temp = mul_add (a7, b2, temp ); + temp = mul_add (a6, b3, temp ); + temp = mul_add (a5, b4, temp ); + temp = mul_add (a4, b5, temp ); + temp = mul_add (a3, b6, temp ); + temp = mul_add (a2, b7, temp ); + temp = mul_add (a0, b[7], temp ); + temp=mul_add(a1, b[6], temp); + c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); + + + temp = _mm256_mullo_epi16 (a[7], b1); + temp = mul_add (a7, b0, temp ); + temp = mul_add (a6, b2, temp ); + temp = mul_add (a5, b3, temp ); + temp = mul_add (a4, b4, temp ); + temp = mul_add (a3, b5, temp ); + temp = mul_add (a2, b6, temp ); + temp = mul_add (a0, b7, temp ); + temp=mul_add(a1, b[7], temp); + c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); + + + temp = _mm256_mullo_epi16 (a7, b1); + temp = mul_add (a6, b0, temp ); + temp = mul_add (a5, b2, temp ); + temp = mul_add (a4, b3, temp ); + temp = mul_add (a3, b4, temp ); + temp = mul_add (a2, b5, temp ); + temp = mul_add (a0, b6, temp ); + temp=mul_add(a1, b7, temp); + c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); + + + temp = _mm256_mullo_epi16 (a6, b1); + temp = mul_add (a5, b0, temp ); + temp = mul_add (a4, b2, temp ); + temp = mul_add (a3, b3, temp ); + temp = mul_add (a2, b4, temp ); + temp = mul_add (a0, b5, temp ); + temp=mul_add(a1, b6, temp); + c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); + + + temp = _mm256_mullo_epi16 (a5, b1); + temp = mul_add (a4, b0, temp ); + temp = mul_add (a3, b2, temp ); + temp = 
mul_add (a2, b3, temp ); + temp = mul_add (a0, b4, temp ); + temp=mul_add(a1, b5, temp); + c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); + + + temp = _mm256_mullo_epi16 (a4, b1); + temp = mul_add (a3, b0, temp ); + temp = mul_add (a2, b2, temp ); + temp = mul_add (a0, b3, temp ); + temp=mul_add(a1, b4, temp); + c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); + + + temp = _mm256_mullo_epi16 (a3, b1); + temp = mul_add (a2, b0, temp ); + temp = mul_add (a0, b2, temp ); + temp=mul_add(a1, b3, temp); + c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); + + + temp = _mm256_mullo_epi16 (a2, b1); + temp = mul_add (a0, b0, temp ); + temp=mul_add(a1, b2, temp); + c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); + + + temp = _mm256_mullo_epi16 (a0, b1); + temp=mul_add(a1, b0, temp); + c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); + + + c_avx[30] = mul_add(a1, b1, c_avx[30]); + + + + c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); + + +} + + + +static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched + //the c_avx are not added cummulatively +{ + + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + + a0=a[0]; + a1=a[1]; + a2=a[2]; + a3=a[3]; + a4=a[4]; + a5=a[5]; + a6=a[6]; + a7=a[7]; + + b0=b[0]; + b1=b[1]; + b2=b[2]; + b3=b[3]; + b4=b[4]; + b5=b[5]; + b6=b[6]; + b7=b[7]; + + // New Unrolled first triangle + c_avx[0] = _mm256_mullo_epi16 (a0, b0); + + temp = _mm256_mullo_epi16 (a0, b1); + c_avx[1]=mul_add(a1, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b2); + + temp = mul_add(a1, b1, temp); + c_avx[2]= mul_add(a2, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + c_avx[3]= mul_add(a3, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + c_avx[4]= mul_add(a2, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b5); + 
temp = mul_add(a1, b4 , temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + c_avx[5] = mul_add(a5, b0, temp); + + temp = _mm256_mullo_epi16 (a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + c_avx[6] = mul_add(a4, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add (a6, b1, temp); + temp = mul_add (a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add (a3, b4, temp); + temp = mul_add (a4, b3, temp); + c_avx[7] = mul_add (a5, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[8]); + temp = mul_add (a1, b7, temp); + temp = mul_add (a7, b1, temp); + temp = mul_add (a[8], b0, temp); + temp = mul_add (a2, b6,temp); + temp = mul_add(a3, b5, temp); + temp = mul_add (a4, b4,temp); + temp = mul_add (a5, b3, temp); + c_avx[8] = mul_add (a6, b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[9]); + temp = mul_add (a1, b[8], temp); + temp = mul_add (a[8], b1, temp); + temp = mul_add (a[9], b0, temp); + temp = mul_add (a2, b7, temp); + temp = mul_add (a3, b6, temp); + temp = mul_add (a4, b5, temp); + temp = mul_add (a5, b4, temp); + temp = mul_add (a6, b3, temp); + c_avx[9] = mul_add (a7, b2, temp); + + temp= _mm256_mullo_epi16 (a0, b[10]); + temp = mul_add (a1, b[9], temp); + temp = mul_add (a[9], b1, temp); + temp = mul_add (a[10], b0, temp); + temp = mul_add (a2, b[8], temp); + temp = mul_add (a3, b7, temp); + temp = mul_add (a4, b6, temp); + temp = mul_add (a5, b5, temp); + temp = mul_add (a6, b4, temp); + temp = mul_add (a7, b3, temp); + c_avx[10] = mul_add (a[8], b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[11]); + temp = mul_add (a1, b[10], temp ); + temp = mul_add (a[10], b1, temp ); + temp = mul_add (a[11], b0, temp ); + temp = mul_add (a2, b[9], temp ); + temp = mul_add (a3, b[8], temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add 
(a5, b6, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a[8], b3, temp ); + c_avx[11] = mul_add (a[9], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[12]); + temp = mul_add (a1, b[11], temp); + temp = mul_add (a[11], b1, temp); + temp = mul_add (a[12], b0, temp); + temp = mul_add (a2, b[10], temp); + temp = mul_add (a3, b[9], temp); + temp = mul_add (a4, b[8], temp); + temp = mul_add (a5, b7, temp); + temp = mul_add (a6, b6, temp); + temp = mul_add (a7, b5, temp); + temp = mul_add (a[8], b4, temp); + temp = mul_add (a[9], b3, temp); + c_avx[12] = mul_add (a[10], b2, temp); + + temp = _mm256_mullo_epi16 (a0, b[13]); + temp = mul_add (a1, b[12], temp ); + temp = mul_add (a[12], b1, temp ); + temp = mul_add (a[13], b0, temp ); + temp = mul_add (a2, b[11], temp ); + temp = mul_add (a3, b[10], temp ); + temp = mul_add (a4, b[9], temp ); + temp = mul_add (a5, b[8], temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a[8], b5, temp ); + temp = mul_add (a[9], b4, temp ); + temp = mul_add (a[10], b3, temp ); + c_avx[13] = mul_add (a[11], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[14]); + temp = mul_add (a1, b[13], temp ); + temp = mul_add (a[13], b1, temp ); + temp = mul_add (a[14], b0, temp ); + temp = mul_add (a2, b[12], temp ); + temp = mul_add (a3, b[11], temp ); + temp = mul_add (a4, b[10], temp ); + temp = mul_add (a5, b[9], temp ); + temp = mul_add (a6, b[8], temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a[8], b6, temp ); + temp = mul_add (a[9], b5, temp ); + temp = mul_add (a[10], b4, temp ); + temp = mul_add (a[11], b3, temp ); + c_avx[14] = mul_add (a[12], b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b[15]); + temp = mul_add (a1, b[14], temp ); + temp = mul_add (a[14], b1, temp ); + temp = mul_add (a[15], b0, temp ); + temp = mul_add (a2, b[13], temp ); + temp = mul_add (a3, b[12], temp ); + temp = mul_add (a4, b[11], temp ); + temp = mul_add (a5, 
b[10], temp ); + temp = mul_add (a6, b[9], temp ); + temp = mul_add (a7, b[8], temp ); + temp = mul_add (a[8], b7, temp ); + temp = mul_add (a[9], b6, temp ); + temp = mul_add (a[10], b5, temp ); + temp = mul_add (a[11], b4, temp ); + temp = mul_add (a[12], b3, temp ); + c_avx[15] = mul_add (a[13], b2, temp ); + + + // unrolled second triangle + a0=a[14]; + a1=a[15]; + a2=a[13]; + a3=a[12]; + a4=a[11]; + a5=a[10]; + a6=a[9]; + a7=a[8]; + + b0=b[14]; + b1=b[15]; + b2=b[13]; + b3=b[12]; + b4=b[11]; + b5=b[10]; + b6=b[9]; + b7=b[8]; + + + temp = _mm256_mullo_epi16 (a[1], b1); + temp = mul_add (a[2], b0, temp ); + temp = mul_add (a[3], b2, temp ); + temp = mul_add (a[4], b3, temp ); + temp = mul_add (a[5], b4, temp ); + temp = mul_add (a[6], b5, temp ); + temp = mul_add (a[7], b6, temp ); + temp = mul_add (a7, b7, temp ); + temp = mul_add (a6, b[7], temp ); + temp = mul_add (a5, b[6], temp ); + temp = mul_add (a4, b[5], temp ); + temp = mul_add (a3, b[4], temp ); + temp = mul_add (a2, b[3], temp ); + temp = mul_add (a0, b[2], temp ); + c_avx[16] = mul_add (a1, b[1], temp ); + + temp = _mm256_mullo_epi16 (a[2], b1); + temp = mul_add (a[3], b0, temp ); + temp = mul_add (a[4], b2, temp ); + temp = mul_add (a[5], b3, temp ); + temp = mul_add (a[6], b4, temp ); + temp = mul_add (a[7], b5, temp ); + temp = mul_add (a7, b6, temp ); + temp = mul_add (a6, b7, temp ); + temp = mul_add (a5, b[7], temp ); + temp = mul_add (a4, b[6], temp ); + temp = mul_add (a3, b[5], temp ); + temp = mul_add (a2, b[4], temp ); + temp = mul_add (a0, b[3], temp ); + c_avx[17] = mul_add (a1, b[2], temp ); + + temp = _mm256_mullo_epi16 (a[3], b1); + temp = mul_add (a[4], b0, temp ); + temp = mul_add (a[5], b2, temp ); + temp = mul_add (a[6], b3, temp ); + temp = mul_add (a[7], b4, temp ); + temp = mul_add (a7, b5, temp ); + temp = mul_add (a6, b6, temp ); + temp = mul_add (a5, b7, temp ); + temp = mul_add (a4, b[7], temp ); + temp = mul_add (a3, b[6], temp ); + temp = mul_add (a2, b[5], temp ); + 
temp = mul_add (a0, b[4], temp ); + c_avx[18] = mul_add (a1, b[3], temp ); + + temp = _mm256_mullo_epi16 (a[4], b1); + temp = mul_add (a[5], b0, temp ); + temp = mul_add (a[6], b2, temp ); + temp = mul_add (a[7], b3, temp ); + temp = mul_add (a7, b4, temp ); + temp = mul_add (a6, b5, temp ); + temp = mul_add (a5, b6, temp ); + temp = mul_add (a4, b7, temp ); + temp = mul_add (a3, b[7], temp ); + temp = mul_add (a2, b[6], temp ); + temp = mul_add (a0, b[5], temp ); + c_avx[19] = mul_add (a1, b[4], temp ); + + temp = _mm256_mullo_epi16 (a[5], b1); + temp = mul_add (a[6], b0, temp ); + temp = mul_add (a[7], b2, temp ); + temp = mul_add (a7, b3, temp ); + temp = mul_add (a6, b4, temp ); + temp = mul_add (a5, b5, temp ); + temp = mul_add (a4, b6, temp ); + temp = mul_add (a3, b7, temp ); + temp = mul_add (a2, b[7], temp ); + temp = mul_add (a0, b[6], temp ); + c_avx[20] = mul_add (a1, b[5], temp ); + + temp = _mm256_mullo_epi16 (a[6], b1); + temp = mul_add (a[7], b0, temp ); + temp = mul_add (a7, b2, temp ); + temp = mul_add (a6, b3, temp ); + temp = mul_add (a5, b4, temp ); + temp = mul_add (a4, b5, temp ); + temp = mul_add (a3, b6, temp ); + temp = mul_add (a2, b7, temp ); + temp = mul_add (a0, b[7], temp ); + c_avx[21] = mul_add (a1, b[6], temp ); + + temp = _mm256_mullo_epi16 (a[7], b1); + temp = mul_add (a7, b0, temp ); + temp = mul_add (a6, b2, temp ); + temp = mul_add (a5, b3, temp ); + temp = mul_add (a4, b4, temp ); + temp = mul_add (a3, b5, temp ); + temp = mul_add (a2, b6, temp ); + temp = mul_add (a0, b7, temp ); + c_avx[22] = mul_add (a1, b[7], temp ); + + temp = _mm256_mullo_epi16 (a7, b1); + temp = mul_add (a6, b0, temp ); + temp = mul_add (a5, b2, temp ); + temp = mul_add (a4, b3, temp ); + temp = mul_add (a3, b4, temp ); + temp = mul_add (a2, b5, temp ); + temp = mul_add (a0, b6, temp ); + c_avx[23] = mul_add (a1, b7, temp ); + + temp = _mm256_mullo_epi16 (a6, b1); + temp = mul_add (a5, b0, temp ); + temp = mul_add (a4, b2, temp ); + temp = mul_add (a3, 
b3, temp ); + temp = mul_add (a2, b4, temp ); + temp = mul_add (a0, b5, temp ); + c_avx[24] = mul_add (a1, b6, temp ); + + temp = _mm256_mullo_epi16 (a5, b1); + temp = mul_add (a4, b0, temp ); + temp = mul_add (a3, b2, temp ); + temp = mul_add (a2, b3, temp ); + temp = mul_add (a0, b4, temp ); + c_avx[25] = mul_add (a1, b5, temp ); + + temp = _mm256_mullo_epi16 (a4, b1); + temp = mul_add (a3, b0, temp ); + temp = mul_add (a2, b2, temp ); + temp = mul_add (a0, b3, temp ); + c_avx[26] = mul_add (a1, b4, temp ); + + temp = _mm256_mullo_epi16 (a3, b1); + temp = mul_add (a2, b0, temp ); + temp = mul_add (a0, b2, temp ); + c_avx[27] = mul_add (a1, b3, temp ); + + temp = _mm256_mullo_epi16 (a2, b1); + temp = mul_add (a0, b0, temp ); + c_avx[28] = mul_add (a1, b2, temp ); + + temp = _mm256_mullo_epi16 (a0, b1); + c_avx[29] = mul_add (a1, b0, temp); + + c_avx[30] = _mm256_mullo_epi16 (a1, b1); + + + c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); + +} diff --git a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c new file mode 100644 index 00000000..78fb86c2 --- /dev/null +++ b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c @@ -0,0 +1,1010 @@ +/* +Cleaned version for step by step approach look into the _debug file +*/ +//#include "timing.c" +#include "consts.h" +#include "matrix.c" +#include "scm_avx.c" + +static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX. 
+{ + __m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time + + //uint16_t i; + + register __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + + + //CLOCK1=cpucycles(); + + //------------------AVX evaluation for 1st poly----------------------- + + r0_avx=a[0]; + r1_avx=a[1]; + r2_avx=a[2]; + r3_avx=a[3]; + a_bucket[0]=r0_avx; + a_bucket[1]=r1_avx; + a_bucket[2]=r2_avx; + a_bucket[3]=r3_avx; + a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]); + + + //------------------AVX evaluation for 1st poly ends------------------ + + + //------------------AVX evaluation for 2nd poly----------------------- + r0_avx=a[small_len_avx]; + r1_avx=a[small_len_avx+1]; + r2_avx=a[small_len_avx+2]; + r3_avx=a[small_len_avx+3]; + a_bucket[0+9]=r0_avx; + a_bucket[1+9]=r1_avx; + a_bucket[2+9]=r2_avx; + a_bucket[3+9]=r3_avx; + a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]); + + + //------------------AVX evaluation for 2nd poly ends------------------ + + + //------------------AVX evaluation for 3rd poly----------------------- + r0_avx=a[2*small_len_avx]; + r1_avx=a[2*small_len_avx+1]; + r2_avx=a[2*small_len_avx+2]; + r3_avx=a[2*small_len_avx+3]; + a_bucket[0+18]=r0_avx; + a_bucket[1+18]=r1_avx; + a_bucket[2+18]=r2_avx; + a_bucket[3+18]=r3_avx; + a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]); + + //------------------AVX 
evaluation for 3rd poly ends------------------ + + + //------------------AVX evaluation for 4th poly----------------------- + + r0_avx=a[3*small_len_avx]; + r1_avx=a[3*small_len_avx+1]; + r2_avx=a[3*small_len_avx+2]; + r3_avx=a[3*small_len_avx+3]; + a_bucket[0+27]=r0_avx; + a_bucket[1+27]=r1_avx; + a_bucket[2+27]=r2_avx; + a_bucket[3+27]=r3_avx; + a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]); + + //------------------AVX evaluation for 4th poly ends------------------ + + //------------------AVX evaluation for 5th poly----------------------- + + r0_avx=a[4*small_len_avx+0]; + r1_avx=a[4*small_len_avx+1]; + r2_avx=a[4*small_len_avx+2]; + r3_avx=a[4*small_len_avx+3]; + a_bucket[0+36]=r0_avx; + a_bucket[1+36]=r1_avx; + a_bucket[2+36]=r2_avx; + a_bucket[3+36]=r3_avx; + a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]); + + //------------------AVX evaluation for 5th poly ends------------------ + + + //------------------AVX evaluation for 6th poly----------------------- + r0_avx=a[5*small_len_avx]; + r1_avx=a[5*small_len_avx+1]; + r2_avx=a[5*small_len_avx+2]; + r3_avx=a[5*small_len_avx+3]; + a_bucket[0+45]=r0_avx; + a_bucket[1+45]=r1_avx; + a_bucket[2+45]=r2_avx; + a_bucket[3+45]=r3_avx; + a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]); + + //------------------AVX evaluation for 6th poly ends------------------ + 
+ //------------------AVX evaluation for 7th poly----------------------- + + r0_avx=a[6*small_len_avx]; + r1_avx=a[6*small_len_avx+1]; + r2_avx=a[6*small_len_avx+2]; + r3_avx=a[6*small_len_avx+3]; + a_bucket[0+54]=r0_avx; + a_bucket[1+54]=r1_avx; + a_bucket[2+54]=r2_avx; + a_bucket[3+54]=r3_avx; + a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); + a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); + a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); + a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); + a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]); + + //------------------AVX evaluation for 7th poly ends------------------ + + + + //CLOCK2=cpucycles(); + //CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1); + //printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1); + + + //CLOCK1=cpucycles(); + //-----------------Forward transposes-------------------------------------- + transpose_n1(a_bucket); + transpose_n1(a_bucket+16); + transpose_n1(a_bucket+32); + transpose_n1(a_bucket+48); + + //-----------------Forwatrd transposes ends--------------------------------- + + //----------------------all multiplications--------------------------------- + if(f==0){ + schoolbook_avx_new2(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); + schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); + schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); + } + else{ + schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); + //schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket); + schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); + schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); + } + /* + schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f); + schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f); + schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, 
c_bucket+4*SCM_SIZE, f); + schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f); + */ + + + //----------------------all multiplications ends----------------------------- + + + //-----------------Reverse transposes-------------------------------------- + + /* + transpose(c_bucket); + transpose(c_bucket+16); + + transpose(c_bucket+2*SCM_SIZE); + transpose(c_bucket+16+2*SCM_SIZE); + + transpose(c_bucket+4*SCM_SIZE); + transpose(c_bucket+16+4*SCM_SIZE); + + transpose(c_bucket+6*SCM_SIZE); + transpose(c_bucket+16+6*SCM_SIZE); + */ + //-----------------Reverse transposes ends--------------------------------- + + //CLOCK2=cpucycles(); + //CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1); + + //KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6); + +} + +static void KARA_eval(__m256i* b, __m256i *b_bucket){ + + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + + //-------1st poly---------------------------------------------------- + r0_avx=b[0]; + r1_avx=b[1]; + r2_avx=b[2]; + r3_avx=b[3]; + b_bucket[0]=r0_avx; + b_bucket[1]=r1_avx; + b_bucket[2]=r2_avx; + b_bucket[3]=r3_avx; + b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]); + //-------2nd poly---------------------------------------------------- + + r0_avx=b[small_len_avx]; + r1_avx=b[small_len_avx+1]; + r2_avx=b[small_len_avx+2]; + r3_avx=b[small_len_avx+3]; + b_bucket[0+9]=r0_avx; + b_bucket[1+9]=r1_avx; + b_bucket[2+9]=r2_avx; + b_bucket[3+9]=r3_avx; + b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]); + + //-------3rd 
poly---------------------------------------------------- + + r0_avx=b[2*small_len_avx+0]; + r1_avx=b[2*small_len_avx+1]; + r2_avx=b[2*small_len_avx+2]; + r3_avx=b[2*small_len_avx+3]; + b_bucket[0+18]=r0_avx; + b_bucket[1+18]=r1_avx; + b_bucket[2+18]=r2_avx; + b_bucket[3+18]=r3_avx; + b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]); + + //-------4th poly---------------------------------------------------- + r0_avx=b[3*small_len_avx]; + r1_avx=b[3*small_len_avx+1]; + r2_avx=b[3*small_len_avx+2]; + r3_avx=b[3*small_len_avx+3]; + b_bucket[0+27]=r0_avx; + b_bucket[1+27]=r1_avx; + b_bucket[2+27]=r2_avx; + b_bucket[3+27]=r3_avx; + b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]); + + //-------5th poly---------------------------------------------------- + + r0_avx=b[4*small_len_avx]; + r1_avx=b[4*small_len_avx+1]; + r2_avx=b[4*small_len_avx+2]; + r3_avx=b[4*small_len_avx+3]; + b_bucket[0+36]=r0_avx; + b_bucket[1+36]=r1_avx; + b_bucket[2+36]=r2_avx; + b_bucket[3+36]=r3_avx; + b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]); + + //-------6th poly---------------------------------------------------- + + r0_avx=b[5*small_len_avx]; + r1_avx=b[5*small_len_avx+1]; + r2_avx=b[5*small_len_avx+2]; + r3_avx=b[5*small_len_avx+3]; + b_bucket[0+45]=r0_avx; + b_bucket[1+45]=r1_avx; + b_bucket[2+45]=r2_avx; + b_bucket[3+45]=r3_avx; 
+ b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]); + + //-------7th poly---------------------------------------------------- + + r0_avx=b[6*small_len_avx]; + r1_avx=b[6*small_len_avx+1]; + r2_avx=b[6*small_len_avx+2]; + r3_avx=b[6*small_len_avx+3]; + b_bucket[0+54]=r0_avx; + b_bucket[1+54]=r1_avx; + b_bucket[2+54]=r2_avx; + b_bucket[3+54]=r3_avx; + b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); + b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); + b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); + b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); + b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]); + + //--------------Evaluating B poly ends------------------------------- + + transpose_n1(b_bucket); + transpose_n1(b_bucket+16); + transpose_n1(b_bucket+32); + transpose_n1(b_bucket+48); +} + +static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){ + + //int64_t i; + register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results + + __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; + + //CLOCK1=cpucycles(); + + //------------------------AVX interpolation for 1st poly external------------------- + + //loop1 + res_avx0 = c_bucket[0]; + res_avx2 = c_bucket[1]; + res_avx4 = c_bucket[2]; + res_avx6 = c_bucket[3]; + + c6_avx=c_bucket[6]; + c7_avx=c_bucket[7]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx); + + res_avx1 = c_bucket[16]; + res_avx3 = c_bucket[17]; + res_avx5 = c_bucket[18]; + res_avx7 = c_bucket[19]; + + c22_avx=c_bucket[22]; + 
c23_avx=c_bucket[23]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final0[0]=res_avx0; + result_final0[1]=res_avx1; + + result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final0[6]=res_avx6; + result_final0[7]=res_avx7; + + + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + + //loop1 + res_avx0 = c_bucket[9]; //c_bucket0 + res_avx2 = c_bucket[10]; //c_bucket1 + res_avx4 = c_bucket[11]; //c_bucket2 + res_avx6 = c_bucket[12]; //c_bucket3 + + c6_avx=c_bucket[15]; //c_bucket6 + c7_avx=c_bucket[32]; //c_bucket7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); + + res_avx1 = c_bucket[25]; //c_bucket0 + res_avx3 = c_bucket[26]; //c_bucket1 + res_avx5 = c_bucket[27]; //c_bucket2 
+ res_avx7 = c_bucket[28]; //c_bucket3 + + c22_avx=c_bucket[31]; + c23_avx=c_bucket[48]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final1[0]=res_avx0; + result_final1[1]=res_avx1; + + result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final1[6]=res_avx6; + result_final1[7]=res_avx7; + + + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + + //loop1 + res_avx0 = c_bucket[34]; //c_bucket0 + res_avx2 = c_bucket[35]; //c_bucket1 + res_avx4 = c_bucket[36]; + res_avx6 = c_bucket[37]; + + c6_avx=c_bucket[40]; + c7_avx=c_bucket[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); + + res_avx1 = c_bucket[50]; //c_bucket0 + res_avx3 = c_bucket[51]; //c_bucket1 + res_avx5 = 
c_bucket[52]; + res_avx7 = c_bucket[53]; + + c22_avx=c_bucket[56]; + c23_avx=c_bucket[57]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + //loop4 + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + //loop5 + result_final2[0]=res_avx0; + result_final2[1]=res_avx1; + + result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final2[6]=res_avx6; + result_final2[7]=res_avx7; + + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + + //loop1 + res_avx0 = c_bucket[43]; + res_avx2 = c_bucket[44]; + res_avx4 = c_bucket[45]; + res_avx6 = c_bucket[46]; + + c6_avx=c_bucket[65]; + c7_avx=c_bucket[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); + + res_avx1 = c_bucket[59]; + res_avx3 = c_bucket[60]; + res_avx5 = c_bucket[61]; + res_avx7 = c_bucket[62]; + + 
c22_avx=c_bucket[81]; + c23_avx=c_bucket[82]; + + c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final3[0]=res_avx0; + result_final3[1]=res_avx1; + + result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final3[6]=res_avx6; + result_final3[7]=res_avx7; + + + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + + //loop1 + res_avx0 = c_bucket[68]; + res_avx2 = c_bucket[69]; + res_avx4 = c_bucket[70]; + res_avx6 = c_bucket[71]; + + c6_avx=c_bucket[74]; + c7_avx=c_bucket[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); + + res_avx1 = c_bucket[84]; + res_avx3 = c_bucket[85]; + res_avx5 = c_bucket[86]; + res_avx7 = c_bucket[87]; + + c22_avx=c_bucket[90]; + c23_avx=c_bucket[91]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final4[0]=res_avx0; + result_final4[1]=res_avx1; + + result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final4[6]=res_avx6; + result_final4[7]=res_avx7; + + + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + + //loop1 + res_avx0 = c_bucket[77]; + res_avx2 = c_bucket[78]; + res_avx4 = c_bucket[79]; + res_avx6 = c_bucket[96]; + + c6_avx=c_bucket[99]; + c7_avx=c_bucket[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx); + + res_avx1 = c_bucket[93]; + res_avx3 = c_bucket[94]; + res_avx5 = c_bucket[95]; + res_avx7 = c_bucket[112]; + + c22_avx=c_bucket[115]; + c23_avx=c_bucket[116]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final5[0]=res_avx0; + result_final5[1]=res_avx1; + + result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final5[6]=res_avx6; + result_final5[7]=res_avx7; + + + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + + //loop1 + res_avx0 = c_bucket[102]; + res_avx2 = c_bucket[103]; + res_avx4 = c_bucket[104]; + res_avx6 = c_bucket[105]; + + c6_avx=c_bucket[108]; + c7_avx=c_bucket[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); + + res_avx1 = c_bucket[118]; + res_avx3 = c_bucket[119]; + res_avx5 = c_bucket[120]; + res_avx7 = c_bucket[121]; + + c22_avx=c_bucket[124]; + c23_avx=c_bucket[125]; + + 
c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); + + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); + + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); + + temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + + c22_avx=_mm256_add_epi16(c22_avx, c8_avx); + + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + + c7_avx=_mm256_add_epi16(c7_avx, c24_avx); + + + //loop4 + + c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + + c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + //loop5 + result_final6[0]=res_avx0; + result_final6[1]=res_avx1; + + result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); + result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); + + + result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); + result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); + + result_final6[6]=res_avx6; + result_final6[7]=res_avx7; + + + //------------------------AVX interpolation for 7th poly ends-------------- + + //CLOCK2=cpucycles(); + //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); + //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); + + + +} + +static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ + + int i; + +//---------------AVX data----------------------------- + + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7*small_len_avx]; + +//----------------AVX data---------------------------- + + +// EVALUATION + + //CLOCK1=cpucycles(); + + for (i=0; i>= 63; + return 
(uint8_t) r; +} + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_SABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { + size_t i; + + b = -b; + for (i = 0; i < len; i++) { + r[i] ^= b & (x[i] ^ r[i]); + } +} diff --git a/crypto_kem/saber/avx2/verify.h b/crypto_kem/saber/avx2/verify.h new file mode 100644 index 00000000..32edf5d0 --- /dev/null +++ b/crypto_kem/saber/avx2/verify.h @@ -0,0 +1,22 @@ +#ifndef VERIFY_H +#define VERIFY_H +/*------------------------------------------------- +This file has been adapted from the implementation +(available at https://github.com/pq-crystals/kyber) of +"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" + by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, +Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle +----------------------------------------------------*/ + +#include +#include + +/* returns 0 for equal strings, 1 for non-equal strings */ +uint8_t PQCLEAN_SABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len); + + +/* b = 1 means mov, b = 0 means don't mov*/ +void PQCLEAN_SABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + + +#endif diff --git a/crypto_kem/saber/clean/LICENSE b/crypto_kem/saber/clean/LICENSE index 08c799e3..d5d21fff 100644 --- a/crypto_kem/saber/clean/LICENSE +++ b/crypto_kem/saber/clean/LICENSE @@ -1,8 +1 @@ ----------------------------------------------------------------------------------------- -SABER_v1.1 - -Public domain - -Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy, -Frederik Vercauteren ----------------------------------------------------------------------------------------- +Public Domain diff --git a/crypto_kem/saber/clean/Makefile b/crypto_kem/saber/clean/Makefile index 2052d200..cbc1357c 100644 --- a/crypto_kem/saber/clean/Makefile +++ b/crypto_kem/saber/clean/Makefile @@ -1,10 +1,10 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libsaber_clean.a -HEADERS=api.h cbd.h 
poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h +HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o -CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS) +CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) all: $(LIB) diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c index d381194c..fe54f4ca 100644 --- a/crypto_kem/saber/clean/SABER_indcpa.c +++ b/crypto_kem/saber/clean/SABER_indcpa.c @@ -3,296 +3,90 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include "poly_mul.h" #include "randombytes.h" #include #include +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) +void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N] = {0}; -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + uint8_t seed_A[SABER_SEEDBYTES]; + uint8_t seed_s[SABER_NOISE_SEEDBYTES]; + int i, j; -#define h1 4 //2^(EQ-EP-1) + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state + randombytes(seed_s, SABER_NOISE_SEEDBYTES); -#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) + PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_SABER_CLEAN_GenSecret(s, seed_s); + 
PQCLEAN_SABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]); -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose); - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec); - -static void GenMatrix(polyvec *a, const unsigned char *seed) { - unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_SABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); } } + + PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); + PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); + memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); } +void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + uint16_t A[SABER_L][SABER_L][SABER_N]; + uint16_t sp[SABER_L][SABER_N]; + uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t vp[SABER_N] = {0}; + uint16_t mp[SABER_N]; + uint16_t b[SABER_L][SABER_N]; + int i, j; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; -void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk) { - polyvec a[SABER_K]; + PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); + PQCLEAN_SABER_CLEAN_GenSecret(sp, seed_sp); + 
PQCLEAN_SABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); - uint16_t skpv[SABER_K][SABER_N]; - - unsigned char seed[SABER_SEEDBYTES]; - unsigned char noiseseed[SABER_COINBYTES]; - int32_t i, j; - uint16_t mod_q = SABER_Q - 1; - - - uint16_t res[SABER_K][SABER_N]; - - randombytes(seed, SABER_SEEDBYTES); - - // for not revealing system RNG state - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); - randombytes(noiseseed, SABER_COINBYTES); - - GenMatrix(a, seed); //sample matrix A - - // generate secret from constant-time binomial distribution - PQCLEAN_SABER_CLEAN_GenSecret(skpv, noiseseed); - - // do the matrix vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { + for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1); - - // now rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - // shift right 3 bits - res[i][j] = (res[i][j] + h1) & (mod_q); - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP)); + bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); } } - // unload and pack sk=3 x (256 coefficients of 14 bits) - PQCLEAN_SABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q); + PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); + PQCLEAN_SABER_CLEAN_BS2POLVECp(b, pk); + PQCLEAN_SABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits) - // load the public-key coefficients - PQCLEAN_SABER_CLEAN_POLVEC2BS(pk, res, SABER_P); + PQCLEAN_SABER_CLEAN_BS2POLmsg(mp, m); - - // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; + for (j = 0; j < SABER_N; j++) { + vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); } + PQCLEAN_SABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); } +void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { -void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) { - uint32_t i, j, k; - polyvec a[SABER_K]; - unsigned char seed[SABER_SEEDBYTES]; - // public key of received by the client - uint16_t pkcl[SABER_K][SABER_N]; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - uint16_t res[SABER_K][SABER_N]; - uint16_t mod_p = SABER_P - 1; - uint16_t mod_q = SABER_Q - 1; - uint16_t vprime[SABER_N]; - unsigned char msk_c[SABER_SCALEBYTES_KEM]; + uint16_t s[SABER_L][SABER_N]; + uint16_t b[SABER_L][SABER_N]; + uint16_t v[SABER_N] = {0}; + uint16_t cm[SABER_N]; + int i; - // extract the seedbytes from Public Key. 
- for (i = 0; i < SABER_SEEDBYTES; i++) { - seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; - } + PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext); + PQCLEAN_SABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); + PQCLEAN_SABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); - GenMatrix(a, seed); - - // generate secret from constant-time binomial distribution - PQCLEAN_SABER_CLEAN_GenSecret(skpv1, noiseseed); - - // matrix-vector multiplication and rounding - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = 0; - } - } - MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0); - - // now rounding - //shift right 3 bits - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - res[i][j] = ( res[i][j] + h1 ) & mod_q; - res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) ); - } - } - - PQCLEAN_SABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P); - - // ************client matrix-vector multiplication ends************ - - // now calculate the v' - // unpack the public_key - // pkcl is the b in the protocol - PQCLEAN_SABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P); for (i = 0; i < SABER_N; i++) { - vprime[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - skpv1[i][j] = skpv1[i][j] & (mod_p); - } + v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); } - // vector-vector scalar multiplication with mod p - InnerProd(pkcl, skpv1, mod_p, vprime); - - // addition of h1 to vprime - for (i = 0; i < SABER_N; i++) { - vprime[i] = vprime[i] + h1; - } - - // unpack message_received; - for (j = 0; j < SABER_KEYBYTES; j++) { - for (i = 0; i < 8; i++) { - message[8 * j + i] = ((message_received[j] >> i) & 0x01); - } - } - - // message encoding - for (i = 0; i < SABER_N; i++) { - message[i] = (message[i] << (SABER_EP - 1)); - } - - for (k = 0; k < SABER_N; k++) { - vprime[k] = ( (vprime[k] - message[k]) & (mod_p) ) >> (SABER_EP - 
SABER_ET); - } - - - PQCLEAN_SABER_CLEAN_pack_4bit(msk_c, vprime); - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j]; - } -} - - -void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) { - uint32_t i, j; - // secret key of the server - uint16_t sksv[SABER_K][SABER_N]; - uint16_t pksv[SABER_K][SABER_N]; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t mod_p = SABER_P - 1; - uint16_t v[SABER_N]; - uint16_t op[SABER_N]; - - // sksv is the secret-key - PQCLEAN_SABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q); - // pksv is the ciphertext - PQCLEAN_SABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P); - - // vector-vector scalar multiplication with mod p - for (i = 0; i < SABER_N; i++) { - v[i] = 0; - } - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N; j++) { - sksv[i][j] = sksv[i][j] & (mod_p); - } - } - InnerProd(pksv, sksv, mod_p, v); - - //Extraction - for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i]; - } - - PQCLEAN_SABER_CLEAN_un_pack4bit(scale_ar, op); - - //addition of h1 - for (i = 0; i < SABER_N; i++) { - v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1); - } - - // pack decrypted message - POL2MSG(v, message_dec); -} -static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) { - uint16_t acc[SABER_N]; - int32_t i, j, k; - - if (transpose == 1) { - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_SABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - //reduction mod p - res[i][k] = (res[i][k] & mod); - //clear the accumulator - acc[k] = 0; - } - } - } - } else { - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - 
PQCLEAN_SABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N); - for (k = 0; k < SABER_N; k++) { - res[i][k] = res[i][k] + acc[k]; - // reduction - res[i][k] = res[i][k] & mod; - // clear the accumulator - acc[k] = 0; - } - } - } - } -} - -static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) { - int32_t i, j; - - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i); - } - } -} - - -static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) { - uint32_t j, k; - uint16_t acc[SABER_N]; - - // vector-vector scalar multiplication with mod p - for (j = 0; j < SABER_K; j++) { - PQCLEAN_SABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N); - - for (k = 0; k < SABER_N; k++) { - res[k] = res[k] + acc[k]; - // reduction - res[k] = res[k] & mod; - // clear the accumulator - acc[k] = 0; - } - } + PQCLEAN_SABER_CLEAN_POLmsg2BS(m, v); } diff --git a/crypto_kem/saber/clean/SABER_indcpa.h b/crypto_kem/saber/clean/SABER_indcpa.h index f8503f66..3be3ce1c 100644 --- a/crypto_kem/saber/clean/SABER_indcpa.h +++ b/crypto_kem/saber/clean/SABER_indcpa.h @@ -1,9 +1,13 @@ #ifndef INDCPA_H #define INDCPA_H +#include "SABER_params.h" +#include + +void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); + +void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); + +void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); -void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk); -void 
PQCLEAN_SABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext); -void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec); #endif - diff --git a/crypto_kem/saber/clean/SABER_params.h b/crypto_kem/saber/clean/SABER_params.h index faa9f6db..200ed0e6 100644 --- a/crypto_kem/saber/clean/SABER_params.h +++ b/crypto_kem/saber/clean/SABER_params.h @@ -1,50 +1,39 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" -#define SABER_K 3 +/* Change this for different security strengths */ + +/* Don't change anything below this line */ +#define SABER_L 3 #define SABER_MU 8 #define SABER_ET 4 - #define SABER_EQ 13 #define SABER_EP 10 - #define SABER_N 256 -#define SABER_Q 8192 -#define SABER_P 1024 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_SEEDBYTES 32 +#define SABER_NOISE_SEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_HASHBYTES 32 +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation - -#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) - -#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8) - -#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define 
SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ - - - +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif - diff --git a/crypto_kem/saber/clean/api.h b/crypto_kem/saber/clean/api.h index 66c3b8bf..699a19f4 100644 --- a/crypto_kem/saber/clean/api.h +++ b/crypto_kem/saber/clean/api.h @@ -1,14 +1,18 @@ #ifndef PQCLEAN_SABER_CLEAN_API_H #define PQCLEAN_SABER_CLEAN_API_H + #define PQCLEAN_SABER_CLEAN_CRYPTO_ALGNAME "Saber" -#define PQCLEAN_SABER_CLEAN_CRYPTO_SECRETKEYBYTES 2304 -#define PQCLEAN_SABER_CLEAN_CRYPTO_PUBLICKEYBYTES (3*320+32) #define PQCLEAN_SABER_CLEAN_CRYPTO_BYTES 32 #define PQCLEAN_SABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 1088 +#define PQCLEAN_SABER_CLEAN_CRYPTO_PUBLICKEYBYTES 992 +#define PQCLEAN_SABER_CLEAN_CRYPTO_SECRETKEYBYTES 2304 int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); -int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); -int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + +int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk); + +int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); + #endif /* api_h */ diff --git a/crypto_kem/saber/clean/cbd.c b/crypto_kem/saber/clean/cbd.c index a2d9fcdd..e0ccef9d 100644 --- a/crypto_kem/saber/clean/cbd.c +++ b/crypto_kem/saber/clean/cbd.c @@ -1,3 +1,7 @@ +#include "SABER_params.h" +#include "api.h" +#include "cbd.h" +#include 
/*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "SABER_params.h" -#include "api.h" -#include "cbd.h" -#include -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,32 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_SABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint32_t t, d, a[4], b[4]; int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = (uint32_t) load_littleendian(buf + 4 * i, 4); + t = load_littleendian(buf + 4 * i, 4); d = 0; for (j = 0; j < 4; j++) { d += (t >> j) & 0x11111111; } - a[0] = d & 0xf; - b[0] = (d >> 4) & 0xf; - a[1] = (d >> 8) & 0xf; + a[0] = d & 0xf; + b[0] = (d >> 4) & 0xf; + a[1] = (d >> 8) & 0xf; b[1] = (d >> 12) & 0xf; a[2] = (d >> 16) & 0xf; b[2] = (d >> 20) & 0xf; a[3] = (d >> 24) & 0xf; b[3] = (d >> 28); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]); + s[4 * i + 2] = (uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/saber/clean/cbd.h b/crypto_kem/saber/clean/cbd.h index b307921f..88b0b0b5 100644 --- a/crypto_kem/saber/clean/cbd.h +++ 
b/crypto_kem/saber/clean/cbd.h @@ -1,6 +1,5 @@ #ifndef CBD_H #define CBD_H - /*--------------------------------------------------------------------- This file has been adapted from the implementation (available at, Public Domain https://github.com/pq-crystals/kyber) @@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ - -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_SABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); + #endif diff --git a/crypto_kem/saber/clean/kem.c b/crypto_kem/saber/clean/kem.c index 9e5b01f4..ed8e3ac7 100644 --- a/crypto_kem/saber/clean/kem.c +++ b/crypto_kem/saber/clean/kem.c @@ -1,5 +1,6 @@ #include "SABER_indcpa.h" #include "SABER_params.h" +#include "api.h" #include "fips202.h" #include "randombytes.h" #include "verify.h" @@ -7,90 +8,71 @@ #include #include -int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) { + +int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { int i; - // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk - PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk); - - // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk + PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { - sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; + sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i]; // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk } - // Then hash(pk) is appended.
- sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); + sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended. - // Remaining part of sk contains a pseudo-random number. - // This is output when check in crypto_kem_dec() fails. - randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES ); + randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number. + // This is output when check in PQCLEAN_SABER_CLEAN_crypto_kem_dec() fails. return (0); } -int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) { - // Will contain key, coins - unsigned char kr[64]; - unsigned char buf[64]; +int PQCLEAN_SABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { + + uint8_t kr[64]; // Will contain key, coins + uint8_t buf[64]; randombytes(buf, 32); - // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output - sha3_256(buf, buf, 32); + sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash does not release system RNG output - // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM - sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); - - // kr[0:63] <-- Hash(buf[0:63]); - sha3_512(kr, buf, 64); + sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key); Multitarget countermeasure for coins + contributory KEM + sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - // buf[0:31] contains message; kr[32:63] contains randomness r; - PQCLEAN_SABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, ct); + PQCLEAN_SABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; - sha3_256(kr + 32, ct,
SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); - // hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } - -int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) { +int PQCLEAN_SABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { int i; - unsigned char fail; - unsigned char cmp[SABER_BYTES_CCA_DEC]; - unsigned char buf[64]; - - // Will contain key, coins - unsigned char kr[64]; - const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES; - - // buf[0:31] <-- message - PQCLEAN_SABER_CLEAN_indcpa_kem_dec(sk, ct, buf); + uint8_t fail; + uint8_t cmp[SABER_BYTES_CCA_DEC]; + uint8_t buf[64]; + uint8_t kr[64]; // Will contain key, coins + const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; + PQCLEAN_SABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message // Multitarget countermeasure for coins + contributory KEM - // Save hash by storing h(pk) in sk - for (i = 0; i < 32; i++) { + for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i]; } sha3_512(kr, buf, 64); - PQCLEAN_SABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp); + PQCLEAN_SABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk); + fail = PQCLEAN_SABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC); - fail = PQCLEAN_SABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC); - - // overwrite coins in kr with h(c) - sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC); + sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c) PQCLEAN_SABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail); - // hash concatenation of pre-k and h(c) to k - sha3_256(ss, kr, 64); + sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k return (0); } diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c index 
06a74778..e196bd34 100644 --- a/crypto_kem/saber/clean/pack_unpack.c +++ b/crypto_kem/saber/clean/pack_unpack.c @@ -1,254 +1,132 @@ +#include "api.h" #include "pack_unpack.h" +#include -void PQCLEAN_SABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | - ((data[offset_data + 1] & 0x7) << 3) | - ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | - ((data[offset_data + 3] & 0x7) << 1) | - ((data[offset_data + 4] & 0x7) << 4) | - (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | - ((data[offset_data + 6] & 0x7) << 2) | - ((data[offset_data + 7] & 0x7) << 5); - } -} - -void PQCLEAN_SABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) | - (((bytes[offset_byte + 1]) & 0x01) << 2); - data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) | - (((bytes[offset_byte + 2]) & 0x03) << 1); - data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07); - data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07); - } -} - -void PQCLEAN_SABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data; - +void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 
0; j < SABER_N / 2; j++) { + offset_byte = j; offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | - ((data[offset_data + 1] & 0x0f) << 4); + bytes[offset_byte] = (data[offset_data] & 0x0f) | ((data[offset_data + 1] & 0x0f) << 4); } } -void PQCLEAN_SABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) { - uint32_t j; - uint32_t offset_data; - +void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 2; j++) { + offset_byte = j; offset_data = 2 * j; - ar[offset_data] = bytes[j] & 0x0f; - ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f; + data[offset_data] = bytes[offset_byte] & 0x0f; + data[offset_data + 1] = (bytes[offset_byte] >> 4) & 0x0f; } } -void PQCLEAN_SABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | - ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | - ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | - ((data[offset_data + 3] & 0x3f) << 2); - } -} - - -void PQCLEAN_SABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) { - uint32_t j; - uint32_t offset_data, offset_byte; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | - ((bytes[offset_byte + 1] & 0x0f) << 2); - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | - ((bytes[offset_byte + 2] & 0x03) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); - } -} - - -static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t 
offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) | - ((data[i][offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) | - ((data[i][offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) | - ((data[i][offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff); - } - } -} - -static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x03) << 8); - data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | - ((bytes[offset_byte + 2] & 0x0f) << 6); - data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | - ((bytes[offset_byte + 3] & 0x3f) << 4); - data[i][offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | - ((bytes[offset_byte + 4] & 0xff) << 2); - } - } -} - - - -static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) | - ((data[i][offset_data + 1] & 0x07) << 5); 
- bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) | - ((data[i][offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) | - ((data[i][offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) | - ((data[i][offset_data + 4] & 0x0f) << 4); - bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) | - ((data[i][offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) | - ((data[i][offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) | - ((data[i][offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[i][offset_data + 7] >> 5) & 0xff); - } - } -} - -static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) { - uint32_t i, j; - uint32_t offset_data, offset_byte, offset_byte1; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - ((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | - 
((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); - } - } -} - -//only BS2POLq no BS2POLp -void PQCLEAN_SABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) { - uint32_t j; - uint32_t offset_data, offset_byte; - +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; for (j = 0; j < SABER_N / 8; j++) { offset_byte = 13 * j; offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | - ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | - ((bytes[offset_byte + 2] & 0xff) << 3) | - ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | - ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | - ((bytes[offset_byte + 5] & 0xff) << 1) | - ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | - ((bytes[offset_byte + 7] & 0xff) << 4) | - ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | - ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | - ((bytes[offset_byte + 10] & 0xff) << 2) | - ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | - ((bytes[offset_byte + 12] & 0xff) << 5); + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + bytes[offset_byte + 1] = 
((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); + bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); + bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); + bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); + bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); + bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); + bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); + bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); + bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); + bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); + bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); } } -void PQCLEAN_SABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - POLVECq2BS(bytes, data); +static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 8; j++) { + offset_byte = 13 * j; + offset_data = 8 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); + data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); + data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); + data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); + 
data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); + data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); + data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); + data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); } } -void PQCLEAN_SABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - if (modulus == 1024) { - BS2POLVECp(bytes, data); - } else if (modulus == 8192) { - BS2POLVECq(bytes, data); +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); + bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); + bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); + bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); + bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + } +} + +static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j, offset_byte, offset_data; + for (j = 0; j < SABER_N / 4; j++) { + offset_byte = 5 * j; + offset_data = 4 * j; + data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); + data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); + data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); + data[offset_data + 3] = ((bytes[offset_byte 
+ 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); + } +} + +void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + } +} + +void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + } +} + +void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + } +} + +void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + } +} + +void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + } + } +} + +void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); + + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + } } } diff --git a/crypto_kem/saber/clean/pack_unpack.h b/crypto_kem/saber/clean/pack_unpack.h index 2431a217..52537c07 100644 --- a/crypto_kem/saber/clean/pack_unpack.h +++ b/crypto_kem/saber/clean/pack_unpack.h @@ -1,28 +1,27 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H - #include "SABER_params.h" #include #include +void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); -void PQCLEAN_SABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data); - 
-void PQCLEAN_SABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data); - -void PQCLEAN_SABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_SABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar); - -void PQCLEAN_SABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_SABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data); +void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_SABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]); +void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); -void PQCLEAN_SABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); + + +void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); + +void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); + + +void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); + +void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); -void PQCLEAN_SABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus); #endif diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c index 93f55fde..f0403ccf 100644 --- a/crypto_kem/saber/clean/poly.c +++ b/crypto_kem/saber/clean/poly.c @@ -1,21 +1,49 @@ -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, 
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ -#include "SABER_params.h" +#include "api.h" #include "cbd.h" #include "fips202.h" +#include "pack_unpack.h" #include "poly.h" +#include "poly_mul.h" +#include -void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) { - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); - - for (size_t i = 0; i < SABER_K; i++) { - PQCLEAN_SABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); +void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { + int i, j; + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_L; j++) { + if (transpose == 1) { + PQCLEAN_SABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + } else { + PQCLEAN_SABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + } + } + } +} + +void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { + int j; + for (j = 0; j < SABER_L; j++) { + PQCLEAN_SABER_CLEAN_poly_mul_acc(b[j], s[j], res); + } +} + +void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + int i; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_CLEAN_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + size_t i; + + shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); } } diff --git a/crypto_kem/saber/clean/poly.h 
b/crypto_kem/saber/clean/poly.h index 9d216804..dd882cb7 100644 --- a/crypto_kem/saber/clean/poly.h +++ b/crypto_kem/saber/clean/poly.h @@ -1,26 +1,15 @@ #ifndef POLY_H #define POLY_H - -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ - - #include "SABER_params.h" #include -typedef struct { - uint16_t coeffs[SABER_N]; -} poly; +void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); -typedef struct { - poly vec[SABER_K]; -} polyvec; +void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); + +void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); -void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed); #endif diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c index dc1cc779..0655383b 100644 --- a/crypto_kem/saber/clean/poly_mul.c +++ b/crypto_kem/saber/clean/poly_mul.c @@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } } -void PQCLEAN_SABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) { - uint32_t i; - // normal multiplication - uint16_t c[512]; - - for (i = 0; i < 512; i++) { - c[i] = 0; - } +/* res += a*b */ +void PQCLEAN_SABER_CLEAN_poly_mul_acc(const 
uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { + uint16_t c[2 * SABER_N] = {0}; + int i; toom_cook_4way(a, b, c); - // reduction - for (i = n; i < 2 * n; i++) { - res[i - n] = (c[i - n] - c[i]) & (p - 1); + /* reduction */ + for (i = SABER_N; i < 2 * SABER_N; i++) { + res[i - SABER_N] += (c[i - SABER_N] - c[i]); } } diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h index f813be10..e0f10043 100644 --- a/crypto_kem/saber/clean/poly_mul.h +++ b/crypto_kem/saber/clean/poly_mul.h @@ -1,9 +1,9 @@ -#ifndef POLYMUL_H -#define POLYMUL_H - +#ifndef POLY_MUL_H +#define POLY_MUL_H #include "SABER_params.h" #include -void PQCLEAN_SABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n); +void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); + #endif diff --git a/crypto_kem/saber/clean/verify.c b/crypto_kem/saber/clean/verify.c index 81f30604..72f4dd34 100644 --- a/crypto_kem/saber/clean/verify.c +++ b/crypto_kem/saber/clean/verify.c @@ -1,3 +1,5 @@ +#include "verify.h" + /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -5,26 +7,25 @@ This file has been adapted from the implementation by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------*/ -#include "verify.h" -#include + /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_SABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) { +uint8_t PQCLEAN_SABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) { uint64_t r; size_t i; - r = 0; + for (i = 0; i < len; i++) { r |= a[i] ^ b[i]; } r = (~r + 1); // Two's complement r >>= 63; - return (unsigned char)r; + return (uint8_t) r; } /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_SABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) { +void PQCLEAN_SABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) { size_t i; b = -b; diff --git a/crypto_kem/saber/clean/verify.h b/crypto_kem/saber/clean/verify.h index cacb2ee6..f88fe396 100644 --- a/crypto_kem/saber/clean/verify.h +++ b/crypto_kem/saber/clean/verify.h @@ -1,6 +1,5 @@ #ifndef VERIFY_H #define VERIFY_H - /*------------------------------------------------- This file has been adapted from the implementation (available at https://github.com/pq-crystals/kyber) of @@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle #include /* returns 0 for equal strings, 1 for non-equal strings */ -unsigned char PQCLEAN_SABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len); +uint8_t PQCLEAN_SABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len); + /* b = 1 means mov, b = 0 means don't mov*/ -void PQCLEAN_SABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b); +void PQCLEAN_SABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b); + #endif diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml new file mode 100644 index 00000000..1790559f --- /dev/null +++ b/test/duplicate_consistency/firesaber_avx2.yml @@ -0,0 +1,7 @@ +consistency_checks: + - source: + scheme: firesaber + implementation: clean + files: + - verify.h + - verify.c diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml index 60a1a153..3e93674e 100644 --- a/test/duplicate_consistency/firesaber_clean.yml +++ b/test/duplicate_consistency/firesaber_clean.yml @@ -1,31 +1,7 @@ consistency_checks: -- source: - scheme: lightsaber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h -- source: - scheme: saber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h + - source: + scheme: firesaber + implementation: avx2 + files: + - verify.h + - verify.c diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml new file mode 100644 index 00000000..9239f8f0 --- /dev/null +++ b/test/duplicate_consistency/lightsaber_avx2.yml @@ -0,0 +1,45 @@ +consistency_checks: + - source: + scheme: lightsaber + 
implementation: clean + files: + - verify.h + - verify.c + - source: + scheme: saber + implementation: clean + files: + - verify.h + - verify.c + - source: + scheme: saber + implementation: avx2 + files: + - cbd.h + - kem.h + - pack_unpack.h + - poly.h + - SABER_indcpa.h + - verify.h + - kem.c + - pack_unpack.c + - verify.c + - source: + scheme: firesaber + implementation: clean + files: + - verify.h + - verify.c + - source: + scheme: firesaber + implementation: avx2 + files: + - cbd.h + - kem.h + - pack_unpack.h + - poly.h + - SABER_indcpa.h + - verify.h + - kem.c + - pack_unpack.c + - verify.c diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml index a4d483be..14c8975d 100644 --- a/test/duplicate_consistency/lightsaber_clean.yml +++ b/test/duplicate_consistency/lightsaber_clean.yml @@ -1,31 +1,49 @@ consistency_checks: -- source: - scheme: saber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h -- source: - scheme: firesaber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h + - source: + scheme: lightsaber + implementation: avx2 + files: + - verify.h + - verify.c + - source: + scheme: saber + implementation: clean + files: + - cbd.h + - pack_unpack.h + - poly.h + - poly_mul.h + - SABER_indcpa.h + - verify.h + - kem.c + - poly.c + - poly_mul.c + - SABER_indcpa.c + - verify.c + - source: + scheme: saber + implementation: avx2 + files: + - verify.h + - verify.c + - source: + scheme: firesaber + implementation: clean + files: + - cbd.h + - pack_unpack.h + - poly.h + - poly_mul.h + - SABER_indcpa.h + - verify.h + - kem.c + - poly.c + - poly_mul.c + - SABER_indcpa.c + - verify.c + - source: + scheme: firesaber + implementation: avx2 + files: + 
- verify.h + - verify.c diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml new file mode 100644 index 00000000..010ac0c9 --- /dev/null +++ b/test/duplicate_consistency/saber_avx2.yml @@ -0,0 +1,26 @@ +consistency_checks: + - source: + scheme: saber + implementation: clean + files: + - verify.h + - verify.c + - source: + scheme: firesaber + implementation: clean + files: + - verify.h + - verify.c + - source: + scheme: firesaber + implementation: avx2 + files: + - cbd.h + - kem.h + - pack_unpack.h + - poly.h + - SABER_indcpa.h + - verify.h + - kem.c + - pack_unpack.c + - verify.c diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml index 0e1b89dd..7f01d619 100644 --- a/test/duplicate_consistency/saber_clean.yml +++ b/test/duplicate_consistency/saber_clean.yml @@ -1,31 +1,28 @@ consistency_checks: -- source: - scheme: lightsaber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h -- source: - scheme: firesaber - implementation: clean - files: - - cbd.h - - kem.c - - pack_unpack.c - - pack_unpack.h - - poly.c - - poly.h - - poly_mul.c - - poly_mul.h - - SABER_indcpa.h - - verify.c - - verify.h + - source: + scheme: saber + implementation: avx2 + files: + - verify.h + - verify.c + - source: + scheme: firesaber + implementation: clean + files: + - cbd.h + - pack_unpack.h + - poly.h + - poly_mul.h + - SABER_indcpa.h + - verify.h + - kem.c + - poly.c + - poly_mul.c + - SABER_indcpa.c + - verify.c + - source: + scheme: firesaber + implementation: avx2 + files: + - verify.h + - verify.c From baa309ea7a0b8e7c2b7d6ab3829713ae179ec790 Mon Sep 17 00:00:00 2001 From: "John M. 
Schanck" Date: Thu, 15 Oct 2020 20:45:21 -0400 Subject: [PATCH 02/10] *saber/avx2: declare mul_add static --- crypto_kem/firesaber/avx2/polymul/scm_avx.c | 2 +- crypto_kem/lightsaber/avx2/polymul/scm_avx.c | 2 +- crypto_kem/saber/avx2/polymul/scm_avx.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c index 4e4f11f8..48870f51 100644 --- a/crypto_kem/firesaber/avx2/polymul/scm_avx.c +++ b/crypto_kem/firesaber/avx2/polymul/scm_avx.c @@ -4,7 +4,7 @@ #include -inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); } diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c index 4e4f11f8..48870f51 100644 --- a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c +++ b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c @@ -4,7 +4,7 @@ #include -inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); } diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c index 4e4f11f8..48870f51 100644 --- a/crypto_kem/saber/avx2/polymul/scm_avx.c +++ b/crypto_kem/saber/avx2/polymul/scm_avx.c @@ -4,7 +4,7 @@ #include -inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); } From 2f97b11031d2329a257f1e0ea8b7c9dc22e88e00 Mon Sep 17 00:00:00 2001 From: "John M. 
Schanck" Date: Fri, 16 Oct 2020 07:48:23 -0400 Subject: [PATCH 03/10] saber: fix initialization and casting issues --- crypto_kem/firesaber/clean/SABER_indcpa.c | 4 ++-- crypto_kem/firesaber/clean/cbd.c | 2 +- crypto_kem/lightsaber/clean/SABER_indcpa.c | 4 ++-- crypto_kem/saber/clean/SABER_indcpa.c | 4 ++-- crypto_kem/saber/clean/cbd.c | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c index 8f4364e7..76156e79 100644 --- a/crypto_kem/firesaber/clean/SABER_indcpa.c +++ b/crypto_kem/firesaber/clean/SABER_indcpa.c @@ -13,7 +13,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N] = {0}; + uint16_t b[SABER_L][SABER_N] = {{0}}; uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; @@ -41,7 +41,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKE void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t bp[SABER_L][SABER_N] = {{0}}; uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; diff --git a/crypto_kem/firesaber/clean/cbd.c b/crypto_kem/firesaber/clean/cbd.c index 8032eb5c..28fbc61c 100644 --- a/crypto_kem/firesaber/clean/cbd.c +++ b/crypto_kem/firesaber/clean/cbd.c @@ -25,7 +25,7 @@ void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_PO int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = load_littleendian(buf + 3 * i, 3); + t = (uint32_t) load_littleendian(buf + 3 * i, 3); d = 0; for (j = 0; j < 3; j++) { d += 
(t >> j) & 0x249249; diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c index ccb72492..4b9cb150 100644 --- a/crypto_kem/lightsaber/clean/SABER_indcpa.c +++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c @@ -13,7 +13,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N] = {0}; + uint16_t b[SABER_L][SABER_N] = {{0}}; uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; @@ -41,7 +41,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICK void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t bp[SABER_L][SABER_N] = {{0}}; uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c index fe54f4ca..c36f02ea 100644 --- a/crypto_kem/saber/clean/SABER_indcpa.c +++ b/crypto_kem/saber/clean/SABER_indcpa.c @@ -13,7 +13,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N] = {0}; + uint16_t b[SABER_L][SABER_N] = {{0}}; uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; @@ -41,7 +41,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYT void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], 
const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { uint16_t A[SABER_L][SABER_L][SABER_N]; uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {0}; + uint16_t bp[SABER_L][SABER_N] = {{0}}; uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; diff --git a/crypto_kem/saber/clean/cbd.c b/crypto_kem/saber/clean/cbd.c index e0ccef9d..b8dee33b 100644 --- a/crypto_kem/saber/clean/cbd.c +++ b/crypto_kem/saber/clean/cbd.c @@ -25,7 +25,7 @@ void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCO int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = load_littleendian(buf + 4 * i, 4); + t = (uint32_t) load_littleendian(buf + 4 * i, 4); d = 0; for (j = 0; j < 4; j++) { d += (t >> j) & 0x11111111; From e92a052ea48563d5d06bcb539e76b9f36b351d13 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Mon, 19 Oct 2020 12:23:48 -0400 Subject: [PATCH 04/10] saber: output pointers on left, and size_t for indexing --- crypto_kem/firesaber/META.yml | 4 ++-- crypto_kem/firesaber/avx2/SABER_indcpa.c | 12 +++++----- crypto_kem/firesaber/clean/SABER_indcpa.c | 6 ++--- crypto_kem/firesaber/clean/kem.c | 7 +++--- crypto_kem/firesaber/clean/poly.c | 14 ++++++------ crypto_kem/firesaber/clean/poly_mul.c | 26 +++++++++++----------- crypto_kem/firesaber/clean/poly_mul.h | 2 +- crypto_kem/lightsaber/META.yml | 4 ++-- crypto_kem/lightsaber/avx2/SABER_indcpa.c | 12 +++++----- crypto_kem/lightsaber/clean/SABER_indcpa.c | 6 ++--- crypto_kem/lightsaber/clean/kem.c | 7 +++--- crypto_kem/lightsaber/clean/poly.c | 14 ++++++------ crypto_kem/lightsaber/clean/poly_mul.c | 26 +++++++++++----------- crypto_kem/lightsaber/clean/poly_mul.h | 2 +- crypto_kem/saber/META.yml | 4 ++-- crypto_kem/saber/avx2/SABER_indcpa.c | 12 +++++----- crypto_kem/saber/clean/SABER_indcpa.c | 6 ++--- crypto_kem/saber/clean/kem.c | 7 +++--- crypto_kem/saber/clean/poly.c | 14 ++++++------ crypto_kem/saber/clean/poly_mul.c | 26 +++++++++++----------- 
crypto_kem/saber/clean/poly_mul.h | 2 +- 21 files changed, 105 insertions(+), 108 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index def16e46..d1781339 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c index ab017224..625a3f6b 100644 --- a/crypto_kem/firesaber/avx2/SABER_indcpa.c +++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c @@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { } //********************************matrix-vector mul routines***************************************************** -static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { +static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { int64_t i, j; __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time @@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1 } -static void vector_vector_mul(__m256i 
a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { +static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { int64_t i; @@ -162,7 +162,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order // Now truncation @@ -259,7 +259,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DE for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order // Now truncation @@ -302,7 +302,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DE // vector-vector scalar multiplication with mod p - vector_vector_mul(pkcl_avx, b_bucket, vprime_avx); + vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); // Computation of v'+h1 for (i = 0; i < SABER_N / 16; i++) { //adding h1 @@ -392,7 +392,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint TC_eval(sksv_avx[j], b_bucket[j]); } - vector_vector_mul(pksv_avx, b_bucket, v_avx); + vector_vector_mul(v_avx, pksv_avx, b_bucket); for (i = 0; i < SABER_N / 16; i++) { _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c index 76156e79..342eb7ca 100644 --- a/crypto_kem/firesaber/clean/SABER_indcpa.c +++ b/crypto_kem/firesaber/clean/SABER_indcpa.c @@ -17,7 +17,7 @@ void 
PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKE uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; - int i, j; + size_t i, j; randombytes(seed_A, SABER_SEEDBYTES); shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state @@ -45,7 +45,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; - int i, j; + size_t i, j; const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); @@ -77,7 +77,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uin uint16_t b[SABER_L][SABER_N]; uint16_t v[SABER_N] = {0}; uint16_t cm[SABER_N]; - int i; + size_t i; PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext); diff --git a/crypto_kem/firesaber/clean/kem.c b/crypto_kem/firesaber/clean/kem.c index e94219a6..2ffe4e75 100644 --- a/crypto_kem/firesaber/clean/kem.c +++ b/crypto_kem/firesaber/clean/kem.c @@ -4,13 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" +#include #include -#include -#include int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -48,7 +47,7 @@ int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t } int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c index c65175fe..c6d729ba 100644 --- a/crypto_kem/firesaber/clean/poly.c +++ b/crypto_kem/firesaber/clean/poly.c @@ -4,31 +4,31 @@ #include "pack_unpack.h" 
#include "poly.h" #include "poly_mul.h" -#include +#include void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { - int i, j; + size_t i, j; for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_L; j++) { if (transpose == 1) { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); } else { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); } } } } void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - int j; + size_t j; for (j = 0; j < SABER_L; j++) { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(b[j], s[j], res); + PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res, b[j], s[j]); } } void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { uint8_t buf[SABER_L * SABER_POLYVECBYTES]; - int i; + size_t i; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c index 27c92f29..6b527c21 100644 --- a/crypto_kem/firesaber/clean/poly_mul.c +++ b/crypto_kem/firesaber/clean/poly_mul.c @@ -11,13 +11,13 @@ #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y))) #define KARATSUBA_N 64 -static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) { +static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) { uint16_t d01[KARATSUBA_N / 2 - 1]; uint16_t d0123[KARATSUBA_N / 2 - 1]; uint16_t d23[KARATSUBA_N / 2 - 1]; uint16_t result_d01[KARATSUBA_N - 1]; - int32_t i, j; + size_t i, j; memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t)); memset(d01, 0, (KARATSUBA_N / 2 - 1)*sizeof(uint16_t)); @@ -110,7 +110,7 @@ 
static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t -static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *result) { +static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) { uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167; uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB]; @@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re // MULTIPLICATION - karatsuba_simple(aw1, bw1, w1); - karatsuba_simple(aw2, bw2, w2); - karatsuba_simple(aw3, bw3, w3); - karatsuba_simple(aw4, bw4, w4); - karatsuba_simple(aw5, bw5, w5); - karatsuba_simple(aw6, bw6, w6); - karatsuba_simple(aw7, bw7, w7); + karatsuba_simple(w1, aw1, bw1); + karatsuba_simple(w2, aw2, bw2); + karatsuba_simple(w3, aw3, bw3); + karatsuba_simple(w4, aw4, bw4); + karatsuba_simple(w5, aw5, bw5); + karatsuba_simple(w6, aw6, bw6); + karatsuba_simple(w7, aw7, bw7); // INTERPOLATION for (i = 0; i < N_SB_RES; ++i) { @@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } /* res += a*b */ -void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { +void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { uint16_t c[2 * SABER_N] = {0}; - int i; + size_t i; - toom_cook_4way(a, b, c); + toom_cook_4way(c, a, b); /* reduction */ for (i = SABER_N; i < 2 * SABER_N; i++) { diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h index e554d60c..b6911577 100644 --- a/crypto_kem/firesaber/clean/poly_mul.h +++ b/crypto_kem/firesaber/clean/poly_mul.h @@ -3,7 +3,7 @@ #include "SABER_params.h" #include -void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); +void 
PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); #endif diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index 1cc06c9a..7e1dd2eb 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c index 3270a8c9..47f760e9 100644 --- a/crypto_kem/lightsaber/avx2/SABER_indcpa.c +++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c @@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { } //********************************matrix-vector mul routines***************************************************** -static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { +static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { int64_t i, j; __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time @@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1 } -static 
void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { +static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { int64_t i; @@ -162,7 +162,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order // Now truncation @@ -259,7 +259,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order // Now truncation @@ -302,7 +302,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D // vector-vector scalar multiplication with mod p - vector_vector_mul(pkcl_avx, b_bucket, vprime_avx); + vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); // Computation of v'+h1 for (i = 0; i < SABER_N / 16; i++) { //adding h1 @@ -392,7 +392,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uin TC_eval(sksv_avx[j], b_bucket[j]); } - vector_vector_mul(pksv_avx, b_bucket, v_avx); + vector_vector_mul(v_avx, pksv_avx, b_bucket); for (i = 0; i < SABER_N / 16; i++) { _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c index 4b9cb150..9dcdfb93 100644 --- a/crypto_kem/lightsaber/clean/SABER_indcpa.c +++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c @@ 
-17,7 +17,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICK uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; - int i, j; + size_t i, j; randombytes(seed_A, SABER_SEEDBYTES); shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state @@ -45,7 +45,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_ uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; - int i, j; + size_t i, j; const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); @@ -77,7 +77,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const ui uint16_t b[SABER_L][SABER_N]; uint16_t v[SABER_N] = {0}; uint16_t cm[SABER_N]; - int i; + size_t i; PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk); PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext); diff --git a/crypto_kem/lightsaber/clean/kem.c b/crypto_kem/lightsaber/clean/kem.c index eb9353b1..d0a67736 100644 --- a/crypto_kem/lightsaber/clean/kem.c +++ b/crypto_kem/lightsaber/clean/kem.c @@ -4,13 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" +#include #include -#include -#include int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -48,7 +47,7 @@ int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_ } int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c index 1c1e22cc..9bb55afe 100644 --- a/crypto_kem/lightsaber/clean/poly.c +++ b/crypto_kem/lightsaber/clean/poly.c @@ -4,31 +4,31 
@@ #include "pack_unpack.h" #include "poly.h" #include "poly_mul.h" -#include +#include void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { - int i, j; + size_t i, j; for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_L; j++) { if (transpose == 1) { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); } else { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); } } } } void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - int j; + size_t j; for (j = 0; j < SABER_L; j++) { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(b[j], s[j], res); + PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res, b[j], s[j]); } } void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { uint8_t buf[SABER_L * SABER_POLYVECBYTES]; - int i; + size_t i; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c index 5e37a024..c7f5c424 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.c +++ b/crypto_kem/lightsaber/clean/poly_mul.c @@ -11,13 +11,13 @@ #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y))) #define KARATSUBA_N 64 -static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) { +static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) { uint16_t d01[KARATSUBA_N / 2 - 1]; uint16_t d0123[KARATSUBA_N / 2 - 1]; uint16_t d23[KARATSUBA_N / 2 - 1]; uint16_t result_d01[KARATSUBA_N - 1]; - int32_t i, j; + size_t i, j; memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t)); memset(d01, 0, (KARATSUBA_N / 2 - 
1)*sizeof(uint16_t)); @@ -110,7 +110,7 @@ static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t -static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *result) { +static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) { uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167; uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB]; @@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re // MULTIPLICATION - karatsuba_simple(aw1, bw1, w1); - karatsuba_simple(aw2, bw2, w2); - karatsuba_simple(aw3, bw3, w3); - karatsuba_simple(aw4, bw4, w4); - karatsuba_simple(aw5, bw5, w5); - karatsuba_simple(aw6, bw6, w6); - karatsuba_simple(aw7, bw7, w7); + karatsuba_simple(w1, aw1, bw1); + karatsuba_simple(w2, aw2, bw2); + karatsuba_simple(w3, aw3, bw3); + karatsuba_simple(w4, aw4, bw4); + karatsuba_simple(w5, aw5, bw5); + karatsuba_simple(w6, aw6, bw6); + karatsuba_simple(w7, aw7, bw7); // INTERPOLATION for (i = 0; i < N_SB_RES; ++i) { @@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } /* res += a*b */ -void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { uint16_t c[2 * SABER_N] = {0}; - int i; + size_t i; - toom_cook_4way(a, b, c); + toom_cook_4way(c, a, b); /* reduction */ for (i = SABER_N; i < 2 * SABER_N; i++) { diff --git a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h index 0d5cf6ed..5ec233bb 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.h +++ b/crypto_kem/lightsaber/clean/poly_mul.h @@ -3,7 +3,7 @@ #include "SABER_params.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t 
res[SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); #endif diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 50250180..87187702 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c index d16a7a06..5515c610 100644 --- a/crypto_kem/saber/avx2/SABER_indcpa.c +++ b/crypto_kem/saber/avx2/SABER_indcpa.c @@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { } //********************************matrix-vector mul routines***************************************************** -static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) { +static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { int64_t i, j; __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time @@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1 } -static void 
vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) { +static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { int64_t i; @@ -162,7 +162,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order + matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order // Now truncation @@ -259,7 +259,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], for (j = 0; j < NUM_POLY; j++) { TC_eval(sk_avx[j], b_bucket[j]); } - matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order + matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order // Now truncation @@ -302,7 +302,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], // vector-vector scalar multiplication with mod p - vector_vector_mul(pkcl_avx, b_bucket, vprime_avx); + vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); // Computation of v'+h1 for (i = 0; i < SABER_N / 16; i++) { //adding h1 @@ -392,7 +392,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t TC_eval(sksv_avx[j], b_bucket[j]); } - vector_vector_mul(pksv_avx, b_bucket, v_avx); + vector_vector_mul(v_avx, pksv_avx, b_bucket); for (i = 0; i < SABER_N / 16; i++) { _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c index c36f02ea..23325749 100644 --- a/crypto_kem/saber/clean/SABER_indcpa.c +++ b/crypto_kem/saber/clean/SABER_indcpa.c @@ -17,7 +17,7 @@ void 
PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYT uint8_t seed_A[SABER_SEEDBYTES]; uint8_t seed_s[SABER_NOISE_SEEDBYTES]; - int i, j; + size_t i, j; randombytes(seed_A, SABER_SEEDBYTES); shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state @@ -45,7 +45,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], uint16_t vp[SABER_N] = {0}; uint16_t mp[SABER_N]; uint16_t b[SABER_L][SABER_N]; - int i, j; + size_t i, j; const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); @@ -77,7 +77,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t uint16_t b[SABER_L][SABER_N]; uint16_t v[SABER_N] = {0}; uint16_t cm[SABER_N]; - int i; + size_t i; PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk); PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext); diff --git a/crypto_kem/saber/clean/kem.c b/crypto_kem/saber/clean/kem.c index ed8e3ac7..6a7f20c4 100644 --- a/crypto_kem/saber/clean/kem.c +++ b/crypto_kem/saber/clean/kem.c @@ -4,13 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" +#include #include -#include -#include int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -48,7 +47,7 @@ int PQCLEAN_SABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk } int PQCLEAN_SABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c index f0403ccf..2c44e962 100644 --- a/crypto_kem/saber/clean/poly.c +++ b/crypto_kem/saber/clean/poly.c @@ -4,31 +4,31 @@ #include "pack_unpack.h" #include "poly.h" #include "poly_mul.h" -#include +#include 
void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { - int i, j; + size_t i, j; for (i = 0; i < SABER_L; i++) { for (j = 0; j < SABER_L; j++) { if (transpose == 1) { - PQCLEAN_SABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); + PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); } else { - PQCLEAN_SABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); + PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); } } } } void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - int j; + size_t j; for (j = 0; j < SABER_L; j++) { - PQCLEAN_SABER_CLEAN_poly_mul_acc(b[j], s[j], res); + PQCLEAN_SABER_CLEAN_poly_mul_acc(res, b[j], s[j]); } } void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { uint8_t buf[SABER_L * SABER_POLYVECBYTES]; - int i; + size_t i; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c index 0655383b..686960dc 100644 --- a/crypto_kem/saber/clean/poly_mul.c +++ b/crypto_kem/saber/clean/poly_mul.c @@ -11,13 +11,13 @@ #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y))) #define KARATSUBA_N 64 -static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) { +static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) { uint16_t d01[KARATSUBA_N / 2 - 1]; uint16_t d0123[KARATSUBA_N / 2 - 1]; uint16_t d23[KARATSUBA_N / 2 - 1]; uint16_t result_d01[KARATSUBA_N - 1]; - int32_t i, j; + size_t i, j; memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t)); memset(d01, 0, (KARATSUBA_N / 2 - 1)*sizeof(uint16_t)); @@ -110,7 +110,7 @@ static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t -static void toom_cook_4way (const 
uint16_t *a1, const uint16_t *b1, uint16_t *result) { +static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) { uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167; uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB]; @@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re // MULTIPLICATION - karatsuba_simple(aw1, bw1, w1); - karatsuba_simple(aw2, bw2, w2); - karatsuba_simple(aw3, bw3, w3); - karatsuba_simple(aw4, bw4, w4); - karatsuba_simple(aw5, bw5, w5); - karatsuba_simple(aw6, bw6, w6); - karatsuba_simple(aw7, bw7, w7); + karatsuba_simple(w1, aw1, bw1); + karatsuba_simple(w2, aw2, bw2); + karatsuba_simple(w3, aw3, bw3); + karatsuba_simple(w4, aw4, bw4); + karatsuba_simple(w5, aw5, bw5); + karatsuba_simple(w6, aw6, bw6); + karatsuba_simple(w7, aw7, bw7); // INTERPOLATION for (i = 0; i < N_SB_RES; ++i) { @@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re } /* res += a*b */ -void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) { +void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { uint16_t c[2 * SABER_N] = {0}; - int i; + size_t i; - toom_cook_4way(a, b, c); + toom_cook_4way(c, a, b); /* reduction */ for (i = SABER_N; i < 2 * SABER_N; i++) { diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h index e0f10043..82140f5b 100644 --- a/crypto_kem/saber/clean/poly_mul.h +++ b/crypto_kem/saber/clean/poly_mul.h @@ -3,7 +3,7 @@ #include "SABER_params.h" #include -void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); +void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); #endif From 135f95e15b66d6cb6c8e0273c0b63948b5e0586c Mon Sep 17 
00:00:00 2001 From: "John M. Schanck" Date: Wed, 28 Oct 2020 12:02:04 -0400 Subject: [PATCH 05/10] Clean up AVX2 code --- crypto_kem/firesaber/META.yml | 4 +- crypto_kem/firesaber/avx2/Makefile | 2 +- crypto_kem/firesaber/avx2/SABER_indcpa.c | 435 +---- crypto_kem/firesaber/avx2/SABER_params.h | 40 +- crypto_kem/firesaber/avx2/cbd.c | 26 +- crypto_kem/firesaber/avx2/cbd.h | 4 +- crypto_kem/firesaber/avx2/kem.c | 12 +- crypto_kem/firesaber/avx2/kem.h | 32 - crypto_kem/firesaber/avx2/pack_unpack.c | 587 ++----- crypto_kem/firesaber/avx2/pack_unpack.h | 46 +- crypto_kem/firesaber/avx2/poly.c | 62 + crypto_kem/firesaber/avx2/poly.h | 37 +- crypto_kem/firesaber/avx2/poly_mul.c | 1524 +++++++++++++++++ crypto_kem/firesaber/avx2/polymul/consts.h | 20 - crypto_kem/firesaber/avx2/polymul/matrix.c | 303 ---- crypto_kem/firesaber/avx2/polymul/scm_avx.c | 753 -------- .../firesaber/avx2/polymul/toom-cook_4way.c | 1010 ----------- crypto_kem/firesaber/clean/SABER_indcpa.c | 123 +- crypto_kem/firesaber/clean/SABER_indcpa.h | 2 +- crypto_kem/firesaber/clean/SABER_params.h | 12 +- crypto_kem/firesaber/clean/api.h | 2 +- crypto_kem/firesaber/clean/pack_unpack.c | 161 +- crypto_kem/firesaber/clean/pack_unpack.h | 17 +- crypto_kem/firesaber/clean/poly.c | 44 +- crypto_kem/firesaber/clean/poly.h | 16 +- crypto_kem/firesaber/clean/poly_mul.c | 18 +- crypto_kem/firesaber/clean/poly_mul.h | 8 +- crypto_kem/lightsaber/META.yml | 4 +- crypto_kem/lightsaber/avx2/Makefile | 2 +- crypto_kem/lightsaber/avx2/SABER_indcpa.c | 435 +---- crypto_kem/lightsaber/avx2/SABER_params.h | 43 +- crypto_kem/lightsaber/avx2/cbd.c | 19 +- crypto_kem/lightsaber/avx2/cbd.h | 4 +- crypto_kem/lightsaber/avx2/kem.c | 12 +- crypto_kem/lightsaber/avx2/kem.h | 32 - crypto_kem/lightsaber/avx2/pack_unpack.c | 587 ++----- crypto_kem/lightsaber/avx2/pack_unpack.h | 46 +- crypto_kem/lightsaber/avx2/poly.c | 62 + crypto_kem/lightsaber/avx2/poly.h | 37 +- crypto_kem/lightsaber/avx2/poly_mul.c | 1524 +++++++++++++++++ 
crypto_kem/lightsaber/avx2/polymul/consts.h | 20 - crypto_kem/lightsaber/avx2/polymul/matrix.c | 303 ---- crypto_kem/lightsaber/avx2/polymul/scm_avx.c | 753 -------- .../lightsaber/avx2/polymul/toom-cook_4way.c | 1010 ----------- crypto_kem/lightsaber/clean/SABER_indcpa.c | 123 +- crypto_kem/lightsaber/clean/SABER_indcpa.h | 2 +- crypto_kem/lightsaber/clean/SABER_params.h | 12 +- crypto_kem/lightsaber/clean/api.h | 2 +- crypto_kem/lightsaber/clean/pack_unpack.c | 169 +- crypto_kem/lightsaber/clean/pack_unpack.h | 17 +- crypto_kem/lightsaber/clean/poly.c | 44 +- crypto_kem/lightsaber/clean/poly.h | 16 +- crypto_kem/lightsaber/clean/poly_mul.c | 18 +- crypto_kem/lightsaber/clean/poly_mul.h | 8 +- crypto_kem/saber/META.yml | 4 +- crypto_kem/saber/avx2/Makefile | 2 +- crypto_kem/saber/avx2/SABER_indcpa.c | 435 +---- crypto_kem/saber/avx2/SABER_params.h | 43 +- crypto_kem/saber/avx2/cbd.c | 23 +- crypto_kem/saber/avx2/cbd.h | 4 +- crypto_kem/saber/avx2/kem.c | 12 +- crypto_kem/saber/avx2/kem.h | 32 - crypto_kem/saber/avx2/pack_unpack.c | 583 ++----- crypto_kem/saber/avx2/pack_unpack.h | 46 +- crypto_kem/saber/avx2/poly.c | 62 + crypto_kem/saber/avx2/poly.h | 37 +- crypto_kem/saber/avx2/poly_mul.c | 1524 +++++++++++++++++ crypto_kem/saber/avx2/polymul/consts.h | 20 - crypto_kem/saber/avx2/polymul/matrix.c | 303 ---- crypto_kem/saber/avx2/polymul/scm_avx.c | 753 -------- .../saber/avx2/polymul/toom-cook_4way.c | 1010 ----------- crypto_kem/saber/clean/SABER_indcpa.c | 123 +- crypto_kem/saber/clean/SABER_indcpa.h | 2 +- crypto_kem/saber/clean/SABER_params.h | 12 +- crypto_kem/saber/clean/api.h | 2 +- crypto_kem/saber/clean/pack_unpack.c | 153 +- crypto_kem/saber/clean/pack_unpack.h | 17 +- crypto_kem/saber/clean/poly.c | 44 +- crypto_kem/saber/clean/poly.h | 16 +- crypto_kem/saber/clean/poly_mul.c | 18 +- crypto_kem/saber/clean/poly_mul.h | 8 +- test/duplicate_consistency/firesaber_avx2.yml | 9 + .../duplicate_consistency/firesaber_clean.yml | 9 + 
.../duplicate_consistency/lightsaber_avx2.yml | 27 +- .../lightsaber_clean.yml | 19 + test/duplicate_consistency/saber_avx2.yml | 18 +- test/duplicate_consistency/saber_clean.yml | 14 + 87 files changed, 6314 insertions(+), 9674 deletions(-) create mode 100644 crypto_kem/firesaber/avx2/poly.c create mode 100644 crypto_kem/firesaber/avx2/poly_mul.c delete mode 100644 crypto_kem/firesaber/avx2/polymul/consts.h delete mode 100644 crypto_kem/firesaber/avx2/polymul/matrix.c delete mode 100644 crypto_kem/firesaber/avx2/polymul/scm_avx.c delete mode 100644 crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c create mode 100644 crypto_kem/lightsaber/avx2/poly.c create mode 100644 crypto_kem/lightsaber/avx2/poly_mul.c delete mode 100644 crypto_kem/lightsaber/avx2/polymul/consts.h delete mode 100644 crypto_kem/lightsaber/avx2/polymul/matrix.c delete mode 100644 crypto_kem/lightsaber/avx2/polymul/scm_avx.c delete mode 100644 crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c create mode 100644 crypto_kem/saber/avx2/poly.c create mode 100644 crypto_kem/saber/avx2/poly_mul.c delete mode 100644 crypto_kem/saber/avx2/polymul/consts.h delete mode 100644 crypto_kem/saber/avx2/polymul/matrix.c delete mode 100644 crypto_kem/saber/avx2/polymul/scm_avx.c delete mode 100644 crypto_kem/saber/avx2/polymul/toom-cook_4way.c diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index d1781339..0aa614ca 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber - name: avx2 - version: 
https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile index a44bbdb4..b7fbd7d8 100644 --- a/crypto_kem/firesaber/avx2/Makefile +++ b/crypto_kem/firesaber/avx2/Makefile @@ -2,7 +2,7 @@ LIB=libfiresaber_avx2.a HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h -OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o +OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c index 625a3f6b..285a6625 100644 --- a/crypto_kem/firesaber/avx2/SABER_indcpa.c +++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c @@ -1,416 +1,125 @@ -#include "./polymul/toom-cook_4way.c" #include "SABER_indcpa.h" #include "SABER_params.h" -#include "api.h" -#include "cbd.h" #include "fips202.h" #include "pack_unpack.h" +#include "poly.h" #include "randombytes.h" #include -#include #include -//#include "randombytes.h" -//#include "./polymul/toom_cook_4/toom-cook_4way.c" -#define h1 4 //2^(EQ-EP-1) +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) -#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) +void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + 
poly *skpv1 = A[0]; // use first row of A to hold sk temporarily + toom4_points skpv1_eval[SABER_L]; + poly res[SABER_L]; -static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { - int32_t i, j; + uint8_t rand[SABER_NOISESEEDBYTES]; + uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); - } - } -} + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + randombytes(rand, SABER_NOISESEEDBYTES); + PQCLEAN_FIRESABER_AVX2_GenSecret(skpv1, rand); + PQCLEAN_FIRESABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key -static void GenMatrix(polyvec *a, const uint8_t *seed) { - uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_FIRESABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } - } - } -} - -static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { - - uint32_t i; - - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - PQCLEAN_FIRESABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); - } -} - -//********************************matrix-vector mul routines***************************************************** -static void 
matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { - int64_t i, j; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - for (j = 0; j < NUM_POLY; j++) { - - if (isTranspose == 0) { - toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); - } else { - toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); - } - } - - TC_interpol(c_bucket, res_avx[i]); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); } -} + PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A); // sample matrix A + PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order -static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { - - int64_t i; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); - } - TC_interpol(c_bucket, res_avx); -} - -//********************************matrix-vector mul routines***************************************************** - -void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { - - polyvec a[SABER_K]; - - uint16_t skpv1[SABER_K][SABER_N]; - - - - uint8_t seed[SABER_SEEDBYTES]; - uint8_t noiseseed[SABER_COINBYTES]; - int32_t i, j, k; - - -//--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod; - __m256i res_avx[SABER_K][SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - -//--------------AVX declaration ends------------------ - - 
randombytes(seed, SABER_SEEDBYTES); - - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state - randombytes(noiseseed, SABER_COINBYTES); - - - GenMatrix(a, seed); //sample matrix A - - GenSecret(skpv1, noiseseed); - - -// Load sk into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); - } - - } - - // Load a into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } + // rounding + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - - //------------------------do the matrix vector multiplication and rounding------------ - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); - } - matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order - - // Now truncation - - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - } - } - - //------------------Pack sk into byte string------- - - PQCLEAN_FIRESABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); - - //------------------Pack pk into byte string------- - - for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - PQCLEAN_FIRESABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); 
// load the public-key into pk byte string - - - for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. - pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; - } - + PQCLEAN_FIRESABER_AVX2_POLVECp2BS(pk, res); // pack public key } void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + toom4_points skpv1_eval[SABER_L]; - uint32_t i, j, k; - polyvec a[SABER_K]; // skpv; - uint8_t seed[SABER_SEEDBYTES]; - uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t temp[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - - uint8_t msk_c[SABER_SCALEBYTES_KEM]; - - //--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod, mod_p; - __m256i res_avx[SABER_K][SABER_N / 16]; - __m256i vprime_avx[SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - __m256i pkcl_avx[SABER_K][SABER_N / 16]; - - __m256i message_avx[SABER_N / 16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - mod_p = _mm256_set1_epi16(SABER_P - 1); - - - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - - //--------------AVX declaration ends------------------ - for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. 
- seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + PQCLEAN_FIRESABER_AVX2_GenSecret(temp, noiseseed); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); } - GenMatrix(a, seed); - GenSecret(skpv1, noiseseed); + PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A); + PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } - } - } - //-----------------matrix-vector multiplication and rounding - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); - } - matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order - - // Now truncation - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - - } - } - - - //-----this result should be put in b_prime for later use in server. 
- for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - - PQCLEAN_FIRESABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string - -//**************client matrix-vector multiplication ends******************// - - //------now calculate the v' - - //-------unpack the public_key - PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); - } - } - - // InnerProduct - //for(k=0;k> i) & 0x01); - } - } - // message encoding - for (i = 0; i < SABER_N / 16; i++) { - message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); - message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); - } - - // SHIFTRIGHT(v'+h1-m mod p, EP-ET) - for (k = 0; k < SABER_N / 16; k++) { - vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); - vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); - vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); - } - - // Unpack avx - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); - } - - PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(msk_c, temp[0]); - - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + for (i = 0; i < SABER_N; i++) { + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; } + PQCLEAN_FIRESABER_AVX2_POLT2BS(msk_c, vprime); } void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; - 
uint32_t i, j; - uint16_t sksv[SABER_K][SABER_N]; //secret key of the server - uint16_t pksv[SABER_K][SABER_N]; - uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t op[SABER_N]; + poly temp[SABER_L]; + toom4_points sksv_eval[SABER_L]; - //--------------AVX declaration------------------ + const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; - - //__m256i mod_p; - - __m256i v_avx[SABER_N / 16]; - - //__m256i acc[2*SABER_N/16]; - - __m256i sksv_avx[SABER_K][SABER_N / 16]; - __m256i pksv_avx[SABER_K][SABER_N / 16]; - - //mod_p=_mm256_set1_epi16(SABER_P-1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - //--------------AVX declaration ends------------------ - - //-------unpack the public_key - - PQCLEAN_FIRESABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key - PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); - pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); - } + PQCLEAN_FIRESABER_AVX2_BS2POLVECq(temp, sk); + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); } - for (i = 0; i < SABER_N / 16; i++) { - v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); - } + PQCLEAN_FIRESABER_AVX2_BS2POLVECp(temp, ciphertext); + PQCLEAN_FIRESABER_AVX2_InnerProd(v, temp, sksv_eval); + PQCLEAN_FIRESABER_AVX2_BS2POLT(cm, packed_cm); - // InnerProduct(b', s, mod p) - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sksv_avx[j], b_bucket[j]); - } - - vector_vector_mul(v_avx, pksv_avx, b_bucket); - - for (i = 0; i < SABER_N / 16; i++) { - _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); - } - - - for (i = 0; i < 
SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; - } - - PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(op, scale_ar); - - - //addition of h2 for (i = 0; i < SABER_N; i++) { - message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } - - POL2MSG(m, message_dec_unpacked); + PQCLEAN_FIRESABER_AVX2_POLmsg2BS(m, v); } diff --git a/crypto_kem/firesaber/avx2/SABER_params.h b/crypto_kem/firesaber/avx2/SABER_params.h index e1476b6a..6481efec 100644 --- a/crypto_kem/firesaber/avx2/SABER_params.h +++ b/crypto_kem/firesaber/avx2/SABER_params.h @@ -1,45 +1,41 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" - - -#define SABER_K 4 +/* Don't change anything below this line */ +#define SABER_L 4 #define SABER_MU 6 #define SABER_ET 6 -#define SABER_EQ 13 -#define SABER_EP 10 - #define SABER_N 256 -#define SABER_Q 8192 //2^13 -#define SABER_P 1024 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) -#define SABER_HASHBYTES 32 +#define SABER_EQ 13 +#define SABER_Q (1 << SABER_EQ) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * 
SABER_POLYCOMPRESSEDBYTES) -#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif diff --git a/crypto_kem/firesaber/avx2/cbd.c b/crypto_kem/firesaber/avx2/cbd.c index 37970a81..0da0876f 100644 --- a/crypto_kem/firesaber/avx2/cbd.c +++ b/crypto_kem/firesaber/avx2/cbd.c @@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,33 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint32_t t, d, a[4], b[4]; int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = load_littleendian(buf + 3 * i, 3); + t = (uint32_t) load_littleendian(buf + 3 * i, 3); d = 0; for (j = 0; j < 3; j++) { d += (t >> j) & 0x249249; } - a[0] = d & 0x7; - b[0] = (d >> 3) & 0x7; - a[1] = (d >> 6) & 0x7; - b[1] = (d >> 9) & 0x7; + a[0] = d & 0x7; + b[0] = (d >> 3) & 0x7; + a[1] = (d >> 6) & 0x7; + b[1] = (d >> 9) & 0x7; a[2] = (d >> 12) & 0x7; b[2] = (d >> 15) & 0x7; a[3] = (d >> 18) & 0x7; b[3] = (d >> 21); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; - + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]); + s[4 * i + 2] = (uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/firesaber/avx2/cbd.h b/crypto_kem/firesaber/avx2/cbd.h index 210bcc50..dba55d9d 100644 --- a/crypto_kem/firesaber/avx2/cbd.h +++ b/crypto_kem/firesaber/avx2/cbd.h @@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); #endif diff --git a/crypto_kem/firesaber/avx2/kem.c b/crypto_kem/firesaber/avx2/kem.c index 2e72e6aa..92c19a7d 100644 --- a/crypto_kem/firesaber/avx2/kem.c +++ b/crypto_kem/firesaber/avx2/kem.c @@ -4,14 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" -#include +#include #include -#include -#include int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,7 +37,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; + PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); @@ -49,7 +47,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t } int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; @@ -65,7 +63,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const ui sha3_512(kr, buf, 64); - PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); fail = 
PQCLEAN_FIRESABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h index a55514d9..b28b04f6 100644 --- a/crypto_kem/firesaber/avx2/kem.h +++ b/crypto_kem/firesaber/avx2/kem.h @@ -1,35 +1,3 @@ -#ifndef INDCPA_H -#define INDCPA_H - -#include - -void PQCLEAN_FIRESABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); - - -void PQCLEAN_FIRESABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_FIRESABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); - -void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); - -void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); - - -int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); - -int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); - -int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); -//uint64_t clock1,clock2; - -//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; - - -#endif diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c index 33c481b3..41b9747a 100644 --- a/crypto_kem/firesaber/avx2/pack_unpack.c +++ b/crypto_kem/firesaber/avx2/pack_unpack.c @@ -1,502 +1,149 @@ +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" +#include - -void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); - 
bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); - } -} - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); - data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); - data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); - data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); - } - -} - -void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); - } -} - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - data[offset_data] = bytes[j] & 0x0f; - data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; - } -} - -void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t 
bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); + out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); + out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); + in += 4; + out += 3; } } - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; - data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); - } - -} - -void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) 
<< 2); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); - } + out[0] = in[0] & 0x3f; + out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); + out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); + out[3] = ((in[2] & 0xff) >> 2); + in += 3; + out += 4; } } -void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); - } - } -} - -void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 
] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( 
bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } - - -void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } -} - -void 
PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - - - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ 
offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } - - -} - - -void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void 
PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - -void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - //for(i=0;icoeffs; for (j = 0; j < SABER_N / 8; j++) { - //offset_byte=offset_byte1+13*j; - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 
11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; } - //} - - } +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; + } +} -void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - /*This function packs 11 bit data stream 
into 8 bits of data. - */ - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + in += 5; + out += 4; + } +} - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); +void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); + } +} - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); +void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); + } +} - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); +void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); + } +} - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); - - 
bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); +void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); + } +} +void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } - } -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { +void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - - data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); - - data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); - - data[i][offset_data + 4] = ( 
(bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); - - data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); - - data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); - - data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } - - -} - -void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); - } - } - - -} - - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - 
offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); - - data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); - } - } - - -} - -void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - PQCLEAN_FIRESABER_AVX2_POLVECq2BS(bytes, data); - } -} - -void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes); - } else if (modulus == 8192) { - PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes); - } - } diff --git a/crypto_kem/firesaber/avx2/pack_unpack.h b/crypto_kem/firesaber/avx2/pack_unpack.h index ba8a568f..eb6242be 100644 --- a/crypto_kem/firesaber/avx2/pack_unpack.h +++ b/crypto_kem/firesaber/avx2/pack_unpack.h @@ -1,56 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); - -void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly 
*data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); +void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const 
uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/firesaber/avx2/poly.c b/crypto_kem/firesaber/avx2/poly.c new file mode 100644 index 00000000..2a7fa836 --- /dev/null +++ b/crypto_kem/firesaber/avx2/poly.c @@ -0,0 +1,62 @@ +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "poly.h" + + +void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) { + size_t i, j; + toom4_points_product c_eval; + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[0][i], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[j][i], &s_eval[j], 1); + } + PQCLEAN_FIRESABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][0], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][j], &s_eval[j], 1); + } + PQCLEAN_FIRESABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } +} + +void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) { + size_t i; + toom4_points_product c_eval; //Holds results for 9 Karatsuba at a time + + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[0], &s_eval[0], 0); + for (i = 1; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[i], &s_eval[i], 1); + } + + PQCLEAN_FIRESABER_AVX2_toom4_interp(c, &c_eval); +} + +void 
PQCLEAN_FIRESABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_AVX2_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); + } +} diff --git a/crypto_kem/firesaber/avx2/poly.h b/crypto_kem/firesaber/avx2/poly.h index 8443de34..859fb95e 100644 --- a/crypto_kem/firesaber/avx2/poly.h +++ b/crypto_kem/firesaber/avx2/poly.h @@ -1,27 +1,38 @@ #ifndef POLY_H #define POLY_H -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ #include "SABER_params.h" +#include #include -typedef struct { +typedef union { uint16_t coeffs[SABER_N]; + __m256i dummy; } poly; -typedef struct { - poly vec[SABER_K]; -} polyvec; +typedef union { + uint16_t coeffs[4 * SABER_N]; + __m256i dummy; +} toom4_points; -void PQCLEAN_FIRESABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); +typedef union { + uint16_t coeffs[8 * SABER_N]; + __m256i dummy; +} toom4_points_product; + +void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); + +void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); + +void PQCLEAN_FIRESABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); -void PQCLEAN_FIRESABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); +void PQCLEAN_FIRESABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); + +void PQCLEAN_FIRESABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); + +void PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); #endif diff --git a/crypto_kem/firesaber/avx2/poly_mul.c b/crypto_kem/firesaber/avx2/poly_mul.c new file mode 100644 index 00000000..d4e37d59 --- /dev/null +++ b/crypto_kem/firesaber/avx2/poly_mul.c @@ -0,0 +1,1524 @@ +#include "SABER_params.h" +#include "poly.h" + + +#define L (SABER_N / 64) + +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { + return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); +} + +static void 
schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = mul_add(a0, b0, c[0]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[1] = _mm256_add_epi16(temp, c[1]); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + temp = mul_add(a2, b0, temp); + c[2] = _mm256_add_epi16(temp, c[2]); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + temp = mul_add(a3, b0, temp); + c[3] = _mm256_add_epi16(temp, c[3]); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + temp = mul_add(a2, b2, temp); + c[4] = _mm256_add_epi16(temp, c[4]); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + temp = mul_add(a5, b0, temp); + c[5] = _mm256_add_epi16(temp, c[5]); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a4, b2, temp); + c[6] = _mm256_add_epi16(temp, c[6]); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a5, b2, temp); + c[7] = _mm256_add_epi16(temp, c[7]); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); + temp = 
mul_add(a2, b6, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a6, b2, temp); + c[8] = _mm256_add_epi16(temp, c[8]); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a7, b2, temp); + c[9] = _mm256_add_epi16(temp, c[9]); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a[8], b2, temp); + c[10] = _mm256_add_epi16(temp, c[10]); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + temp = mul_add(a[9], b2, temp); + c[11] = _mm256_add_epi16(temp, c[11]); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + temp = mul_add(a[10], b2, temp); + c[12] = _mm256_add_epi16(temp, c[12]); + + temp = _mm256_mullo_epi16(a0, b[13]); + temp = mul_add(a1, b[12], temp); + 
temp = mul_add(a[12], b1, temp); + temp = mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + temp = mul_add(a[11], b2, temp); + c[13] = _mm256_add_epi16(temp, c[13]); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + temp = mul_add(a[12], b2, temp); + c[14] = _mm256_add_epi16(temp, c[14]); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + temp = mul_add(a[13], b2, temp); + c[15] = _mm256_add_epi16(temp, c[15]); + + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + temp = 
mul_add(a[5], b4, temp); + temp = mul_add(a[6], b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + temp = mul_add(a1, b[1], temp); + c[16] = _mm256_add_epi16(temp, c[16]); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + temp = mul_add(a1, b[2], temp); + c[17] = _mm256_add_epi16(temp, c[17]); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + temp = mul_add(a1, b[3], temp); + c[18] = _mm256_add_epi16(temp, c[18]); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + temp = mul_add(a1, b[4], temp); + c[19] = _mm256_add_epi16(temp, c[19]); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, temp); + 
temp = mul_add(a6, b4, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); + temp = mul_add(a0, b[6], temp); + temp = mul_add(a1, b[5], temp); + c[20] = _mm256_add_epi16(temp, c[20]); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + temp = mul_add(a1, b[6], temp); + c[21] = _mm256_add_epi16(temp, c[21]); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + temp = mul_add(a1, b[7], temp); + c[22] = _mm256_add_epi16(temp, c[22]); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + temp = mul_add(a1, b7, temp); + c[23] = _mm256_add_epi16(temp, c[23]); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + temp = mul_add(a1, b6, temp); + c[24] = _mm256_add_epi16(temp, c[24]); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + temp = mul_add(a1, b5, temp); + c[25] = _mm256_add_epi16(temp, c[25]); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + temp = mul_add(a1, b4, temp); + c[26] = 
_mm256_add_epi16(temp, c[26]); + + temp = _mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + temp = mul_add(a1, b3, temp); + c[27] = _mm256_add_epi16(temp, c[27]); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + temp = mul_add(a1, b2, temp); + c[28] = _mm256_add_epi16(temp, c[28]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[29] = _mm256_add_epi16(temp, c[29]); + + c[30] = mul_add(a1, b1, c[30]); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + + +static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = _mm256_mullo_epi16(a0, b0); + + temp = _mm256_mullo_epi16(a0, b1); + c[1] = mul_add(a1, b0, temp); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + c[2] = mul_add(a2, b0, temp); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + c[3] = mul_add(a3, b0, temp); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + c[4] = mul_add(a2, b2, temp); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + c[5] = mul_add(a5, b0, temp); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + c[6] = mul_add(a4, b2, temp); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, 
b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + c[7] = mul_add(a5, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + c[8] = mul_add(a6, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + c[9] = mul_add(a7, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + c[10] = mul_add(a[8], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + c[11] = mul_add(a[9], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + c[12] = mul_add(a[10], 
b2, temp); + + temp = _mm256_mullo_epi16(a0, b[13]); + temp = mul_add(a1, b[12], temp); + temp = mul_add(a[12], b1, temp); + temp = mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + c[13] = mul_add(a[11], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + c[14] = mul_add(a[12], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + c[15] = mul_add(a[13], b2, temp); + + // unrolled second triangle + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + temp = 
mul_add(a[5], b4, temp); + temp = mul_add(a[6], b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + c[16] = mul_add(a1, b[1], temp); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + c[17] = mul_add(a1, b[2], temp); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + c[18] = mul_add(a1, b[3], temp); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + c[19] = mul_add(a1, b[4], temp); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); 
+ temp = mul_add(a0, b[6], temp); + c[20] = mul_add(a1, b[5], temp); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + c[21] = mul_add(a1, b[6], temp); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + c[22] = mul_add(a1, b[7], temp); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + c[23] = mul_add(a1, b7, temp); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + c[24] = mul_add(a1, b6, temp); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + c[25] = mul_add(a1, b5, temp); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + c[26] = mul_add(a1, b4, temp); + + temp = _mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + c[27] = mul_add(a1, b3, temp); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + c[28] = mul_add(a1, b2, temp); + + temp = _mm256_mullo_epi16(a0, b1); + c[29] = mul_add(a1, b0, temp); + + c[30] = _mm256_mullo_epi16(a1, b1); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + +static void transpose(__m256i 
*M) { + __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + __m256i temp, temp0, temp1, temp2; + + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = _mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + r0 = _mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = 
_mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) { + toom4_points a_eval;// Holds evaluation (a & b) for 7 Karatsuba at a time + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + __m256i *va = (__m256i *)a_eval.coeffs; + __m256i *vb = (__m256i *)b_eval->coeffs; + __m256i *vc = (__m256i *)c_eval->coeffs; + + //------------------AVX evaluation for 1st poly----------------------- + r0_avx = a[0 * L + 0]; + r1_avx = a[0 * L + 1]; + r2_avx = a[0 * L + 2]; + r3_avx = a[0 * L + 3]; + + va[0] = r0_avx; + va[1] = r1_avx; + va[2] = r2_avx; + va[3] = r3_avx; + va[4] = _mm256_add_epi16(r0_avx, r1_avx); + va[5] = _mm256_add_epi16(r2_avx, r3_avx); + va[6] = _mm256_add_epi16(r0_avx, r2_avx); + va[7] = _mm256_add_epi16(r1_avx, r3_avx); + va[8] = _mm256_add_epi16(va[6], va[7]); + //------------------AVX evaluation for 1st poly ends------------------ + + //------------------AVX evaluation for 2nd poly----------------------- + r0_avx = a[1 * L + 0]; + r1_avx = a[1 * L + 1]; + r2_avx = a[1 * L + 
2]; + r3_avx = a[1 * L + 3]; + + va[0 + 9] = r0_avx; + va[1 + 9] = r1_avx; + va[2 + 9] = r2_avx; + va[3 + 9] = r3_avx; + va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]); + //------------------AVX evaluation for 2nd poly ends------------------ + + //------------------AVX evaluation for 3rd poly----------------------- + r0_avx = a[2 * L + 0]; + r1_avx = a[2 * L + 1]; + r2_avx = a[2 * L + 2]; + r3_avx = a[2 * L + 3]; + + va[0 + 18] = r0_avx; + va[1 + 18] = r1_avx; + va[2 + 18] = r2_avx; + va[3 + 18] = r3_avx; + va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]); + //------------------AVX evaluation for 3rd poly ends------------------ + + //------------------AVX evaluation for 4th poly----------------------- + r0_avx = a[3 * L + 0]; + r1_avx = a[3 * L + 1]; + r2_avx = a[3 * L + 2]; + r3_avx = a[3 * L + 3]; + + va[0 + 27] = r0_avx; + va[1 + 27] = r1_avx; + va[2 + 27] = r2_avx; + va[3 + 27] = r3_avx; + va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]); + //------------------AVX evaluation for 4th poly ends------------------ + + //------------------AVX evaluation for 5th poly----------------------- + r0_avx = a[4 * L + 0]; + r1_avx = a[4 * L + 1]; + r2_avx = a[4 * L + 2]; + r3_avx = a[4 * L + 3]; + + va[0 + 36] = r0_avx; + va[1 + 36] = r1_avx; + va[2 + 36] = r2_avx; + va[3 + 36] = r3_avx; + va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + 
va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]); + //------------------AVX evaluation for 5th poly ends------------------ + + //------------------AVX evaluation for 6th poly----------------------- + r0_avx = a[5 * L + 0]; + r1_avx = a[5 * L + 1]; + r2_avx = a[5 * L + 2]; + r3_avx = a[5 * L + 3]; + + va[0 + 45] = r0_avx; + va[1 + 45] = r1_avx; + va[2 + 45] = r2_avx; + va[3 + 45] = r3_avx; + va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]); + //------------------AVX evaluation for 6th poly ends------------------ + + //------------------AVX evaluation for 7th poly----------------------- + r0_avx = a[6 * L + 0]; + r1_avx = a[6 * L + 1]; + r2_avx = a[6 * L + 2]; + r3_avx = a[6 * L + 3]; + + va[0 + 54] = r0_avx; + va[1 + 54] = r1_avx; + va[2 + 54] = r2_avx; + va[3 + 54] = r3_avx; + va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]); + //------------------AVX evaluation for 7th poly ends------------------ + + //-----------------Forward transposes-------------------------------------- + transpose(va); + transpose(va + 16); + transpose(va + 32); + transpose(va + 48); + //-----------------Forward transposes ends--------------------------------- + + if (accumulate == 0) { + schoolbook_avx(vc, va, vb); + schoolbook_avx(vc + 32, va + 16, vb + 16); + schoolbook_avx(vc + 64, va + 32, vb + 32); + schoolbook_avx(vc + 96, va + 48, vb + 48); + } else { + schoolbook_avx_acc(vc, va, vb); + schoolbook_avx_acc(vc + 32, va + 16, vb + 16); + schoolbook_avx_acc(vc + 64, va + 32, vb + 32); + 
schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + } +} + +static void karatsuba_eval(__m256i *b_eval, const __m256i *b) { + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + //-------1st poly---------------------------------------------------- + r0_avx = b[0 * L + 0]; + r1_avx = b[0 * L + 1]; + r2_avx = b[0 * L + 2]; + r3_avx = b[0 * L + 3]; + + b_eval[0] = r0_avx; + b_eval[1] = r1_avx; + b_eval[2] = r2_avx; + b_eval[3] = r3_avx; + b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]); + + //-------2nd poly---------------------------------------------------- + r0_avx = b[1 * L + 0]; + r1_avx = b[1 * L + 1]; + r2_avx = b[1 * L + 2]; + r3_avx = b[1 * L + 3]; + + b_eval[0 + 9] = r0_avx; + b_eval[1 + 9] = r1_avx; + b_eval[2 + 9] = r2_avx; + b_eval[3 + 9] = r3_avx; + b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]); + + //-------3rd poly---------------------------------------------------- + r0_avx = b[2 * L + 0]; + r1_avx = b[2 * L + 1]; + r2_avx = b[2 * L + 2]; + r3_avx = b[2 * L + 3]; + + b_eval[0 + 18] = r0_avx; + b_eval[1 + 18] = r1_avx; + b_eval[2 + 18] = r2_avx; + b_eval[3 + 18] = r3_avx; + b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]); + + //-------4th poly---------------------------------------------------- + r0_avx = b[3 * L + 0]; + r1_avx = b[3 * L + 1]; + r2_avx = b[3 * L + 2]; + r3_avx = b[3 * L + 3]; + + b_eval[0 + 27] = r0_avx; + 
b_eval[1 + 27] = r1_avx; + b_eval[2 + 27] = r2_avx; + b_eval[3 + 27] = r3_avx; + b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]); + + //-------5th poly---------------------------------------------------- + r0_avx = b[4 * L + 0]; + r1_avx = b[4 * L + 1]; + r2_avx = b[4 * L + 2]; + r3_avx = b[4 * L + 3]; + + b_eval[0 + 36] = r0_avx; + b_eval[1 + 36] = r1_avx; + b_eval[2 + 36] = r2_avx; + b_eval[3 + 36] = r3_avx; + b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]); + + //-------6th poly---------------------------------------------------- + r0_avx = b[5 * L + 0]; + r1_avx = b[5 * L + 1]; + r2_avx = b[5 * L + 2]; + r3_avx = b[5 * L + 3]; + + b_eval[0 + 45] = r0_avx; + b_eval[1 + 45] = r1_avx; + b_eval[2 + 45] = r2_avx; + b_eval[3 + 45] = r3_avx; + b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]); + + //-------7th poly---------------------------------------------------- + r0_avx = b[6 * L + 0]; + r1_avx = b[6 * L + 1]; + r2_avx = b[6 * L + 2]; + r3_avx = b[6 * L + 3]; + + b_eval[0 + 54] = r0_avx; + b_eval[1 + 54] = r1_avx; + b_eval[2 + 54] = r2_avx; + b_eval[3 + 54] = r3_avx; + b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 
54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]); + + //--------------Evaluating B poly ends------------------------------- + transpose(b_eval); + transpose(b_eval + 16); + transpose(b_eval + 32); + transpose(b_eval + 48); +} + +static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) { + __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results + __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; + + //------------------------AVX interpolation for 1st poly external------------------- + res_avx0 = c_eval[0]; + res_avx2 = c_eval[1]; + res_avx4 = c_eval[2]; + res_avx6 = c_eval[3]; + c6_avx = c_eval[6]; + c7_avx = c_eval[7]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx); + + res_avx1 = c_eval[16]; + res_avx3 = c_eval[17]; + res_avx5 = c_eval[18]; + res_avx7 = c_eval[19]; + c22_avx = c_eval[22]; + c23_avx = c_eval[23]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final0[0] = res_avx0; + result_final0[1] = res_avx1; + result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final0[6] = res_avx6; + result_final0[7] = res_avx7; + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + res_avx0 = c_eval[9]; //c_eval0 + res_avx2 = c_eval[10]; //c_eval1 + res_avx4 = c_eval[11]; //c_eval2 + res_avx6 = c_eval[12]; //c_eval3 + c6_avx = c_eval[15]; //c_eval6 + c7_avx = c_eval[32]; //c_eval7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx); + + res_avx1 = c_eval[25]; //c_eval0 + res_avx3 = c_eval[26]; //c_eval1 + res_avx5 = c_eval[27]; //c_eval2 + res_avx7 = c_eval[28]; //c_eval3 + c22_avx = c_eval[31]; + c23_avx = c_eval[48]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final1[0] = res_avx0; + result_final1[1] = res_avx1; + result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final1[6] = res_avx6; + result_final1[7] = res_avx7; + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + res_avx0 = c_eval[34]; //c_eval0 + res_avx2 = c_eval[35]; //c_eval1 + res_avx4 = c_eval[36]; + res_avx6 = c_eval[37]; + c6_avx = c_eval[40]; + c7_avx = c_eval[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx); + + res_avx1 = c_eval[50]; //c_eval0 + res_avx3 = c_eval[51]; //c_eval1 + res_avx5 = c_eval[52]; + res_avx7 = c_eval[53]; + c22_avx = c_eval[56]; + c23_avx = c_eval[57]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final2[0] = res_avx0; + result_final2[1] = res_avx1; + result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final2[6] = res_avx6; + result_final2[7] = res_avx7; + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + res_avx0 = c_eval[43]; + res_avx2 = c_eval[44]; + res_avx4 = c_eval[45]; + res_avx6 = c_eval[46]; + c6_avx = c_eval[65]; + c7_avx = c_eval[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx); + + res_avx1 = c_eval[59]; + res_avx3 = c_eval[60]; + res_avx5 = c_eval[61]; + res_avx7 = c_eval[62]; + c22_avx = c_eval[81]; + c23_avx = c_eval[82]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final3[0] = res_avx0; + result_final3[1] = 
res_avx1; + result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final3[6] = res_avx6; + result_final3[7] = res_avx7; + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + res_avx0 = c_eval[68]; + res_avx2 = c_eval[69]; + res_avx4 = c_eval[70]; + res_avx6 = c_eval[71]; + c6_avx = c_eval[74]; + c7_avx = c_eval[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx); + + res_avx1 = c_eval[84]; + res_avx3 = c_eval[85]; + res_avx5 = c_eval[86]; + res_avx7 = c_eval[87]; + c22_avx = c_eval[90]; + c23_avx = c_eval[91]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final4[0] = res_avx0; + result_final4[1] = res_avx1; + result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final4[3] = _mm256_add_epi16(res_avx3, 
c22_avx); + result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final4[6] = res_avx6; + result_final4[7] = res_avx7; + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + res_avx0 = c_eval[77]; + res_avx2 = c_eval[78]; + res_avx4 = c_eval[79]; + res_avx6 = c_eval[96]; + c6_avx = c_eval[99]; + c7_avx = c_eval[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx); + + res_avx1 = c_eval[93]; + res_avx3 = c_eval[94]; + res_avx5 = c_eval[95]; + res_avx7 = c_eval[112]; + c22_avx = c_eval[115]; + c23_avx = c_eval[116]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final5[0] = res_avx0; + result_final5[1] = res_avx1; + result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final5[5] = 
_mm256_add_epi16(res_avx5, c23_avx); + result_final5[6] = res_avx6; + result_final5[7] = res_avx7; + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + res_avx0 = c_eval[102]; + res_avx2 = c_eval[103]; + res_avx4 = c_eval[104]; + res_avx6 = c_eval[105]; + c6_avx = c_eval[108]; + c7_avx = c_eval[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx); + + res_avx1 = c_eval[118]; + res_avx3 = c_eval[119]; + res_avx5 = c_eval[120]; + res_avx7 = c_eval[121]; + c22_avx = c_eval[124]; + c23_avx = c_eval[125]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final6[0] = res_avx0; + result_final6[1] = res_avx1; + result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final6[6] = res_avx6; + result_final6[7] = res_avx7; + 
//------------------------AVX interpolation for 7th poly ends-------------- +} + +void PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7 * L]; + __m256i *va = (__m256i *)a->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = va[0 * L + i]; + r1_avx = va[1 * L + i]; + r2_avx = va[2 * L + i]; + r3_avx = va[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + aw_avx[6 * L + i] = r0_avx; + aw_avx[0 * L + i] = r3_avx; + } + + batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate); +} + +void PQCLEAN_FIRESABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) { + size_t i; + __m256i bw_avx[7 * L]; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i *vb = (__m256i *)b->coeffs; + __m256i *vb_eval = (__m256i *)b_eval->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = vb[0 * L + i]; + r1_avx = vb[1 * L + i]; + r2_avx = vb[2 * L + i]; + r3_avx = vb[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + bw_avx[3 
* L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + bw_avx[6 * L + i] = r0_avx; + bw_avx[0 * L + i] = r3_avx; + } + + karatsuba_eval(vb_eval, bw_avx); +} + + +void PQCLEAN_FIRESABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx; + __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; + __m256i res_full[32]; + __m256i *vc = (__m256i *)c_eval->coeffs; + __m256i *vres = (__m256i *)res->coeffs; + + transpose(vc); + transpose(vc + 16); + transpose(vc + 32); + transpose(vc + 48); + transpose(vc + 64); + transpose(vc + 80); + transpose(vc + 96); + transpose(vc + 112); + + karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc); + + for (i = 0; i < 2 * L; i++) { + r0_avx = w1_avx[i]; + r1_avx = w2_avx[i]; + r2_avx = w3_avx[i]; + r3_avx = w4_avx[i]; + r4_avx = w5_avx[i]; + r5_avx = w6_avx[i]; + r6_avx = w7_avx[i]; + + r1_avx = _mm256_add_epi16(r1_avx, r4_avx); + r5_avx = _mm256_sub_epi16(r5_avx, r4_avx); + r3_avx = _mm256_sub_epi16(r3_avx, r2_avx); + r3_avx = _mm256_srli_epi16(r3_avx, 1); + r4_avx = _mm256_sub_epi16(r4_avx, r0_avx); + temp_avx = _mm256_slli_epi16(r6_avx, 6); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r5_avx); + 
r2_avx = _mm256_add_epi16(r2_avx, r3_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 6); + + r1_avx = _mm256_sub_epi16(r1_avx, temp_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r2_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r6_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r0_avx); + temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45)); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 3); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16) + r4_avx = _mm256_srli_epi16(r4_avx, 3); + r5_avx = _mm256_add_epi16(r5_avx, r1_avx); + temp_avx = _mm256_slli_epi16(r3_avx, 4); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16) + r1_avx = _mm256_srli_epi16(r1_avx, 1); + r3_avx = _mm256_add_epi16(r1_avx, r3_avx); + r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); + temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30)); + temp_avx = _mm256_sub_epi16(temp_avx, r5_avx); + temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16) + + r5_avx = _mm256_srli_epi16(temp_avx, 2); + r2_avx = _mm256_sub_epi16(r2_avx, r4_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r5_avx); + + if (i < L) { + res_full[0 * L + i] = r6_avx; + res_full[1 * L + i] = r5_avx; + res_full[2 * L + i] = r4_avx; + res_full[3 * L + i] = r3_avx; + res_full[4 * L + i] = r2_avx; + res_full[5 * L + i] = r1_avx; + res_full[6 * L + i] = r0_avx; + } else { + res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx); + res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx); + res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx); + res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx); + res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx); + res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], 
r1_avx); + res_full[6 * L + i] = r0_avx; + } + } + + // Reduction by X^256 + 1 + for (i = 0; i < 16; i++) { + vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]); + } +} diff --git a/crypto_kem/firesaber/avx2/polymul/consts.h b/crypto_kem/firesaber/avx2/polymul/consts.h deleted file mode 100644 index 40826398..00000000 --- a/crypto_kem/firesaber/avx2/polymul/consts.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "../SABER_params.h" - -#define AVX_N (SABER_N >> 4) -#define small_len_avx (AVX_N >> 2) - -#define SCHB_N 16 - -#define N_SB (SABER_N >> 2) -#define N_SB_RES (2*N_SB-1) - -#define N_SB_16 (N_SB >> 2) -#define N_SB_16_RES (2*N_SB_16-1) - -#define AVX_N1 16 /*N/16*/ - -#define SCM_SIZE 16 - -// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements -#define NUM_POLY SABER_K -//int NUM_POLY=2; diff --git a/crypto_kem/firesaber/avx2/polymul/matrix.c b/crypto_kem/firesaber/avx2/polymul/matrix.c deleted file mode 100644 index 5fa35783..00000000 --- a/crypto_kem/firesaber/avx2/polymul/matrix.c +++ /dev/null @@ -1,303 +0,0 @@ -#include - -static void transpose_n1(__m256i *M) -{ - //int i; - register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; - register __m256i temp, temp0, temp1, temp2; - - //for(i=0; i<8; i=i+1) - //{ - r0 = _mm256_unpacklo_epi16(M[0], M[1]); - r1 = _mm256_unpacklo_epi16(M[2], M[3]); - r2 = _mm256_unpacklo_epi16(M[4], M[5]); - r3 = _mm256_unpacklo_epi16(M[6], M[7]); - r4 = _mm256_unpacklo_epi16(M[8], M[9]); - r5 = _mm256_unpacklo_epi16(M[10], M[11]); - r6 = _mm256_unpacklo_epi16(M[12], M[13]); - r7 = _mm256_unpacklo_epi16(M[14], M[15]); - - - temp = _mm256_unpacklo_epi32(r0, r1); - temp0 = _mm256_unpacklo_epi32(r2, r3); - temp1 = _mm256_unpacklo_epi32(r4, r5); - temp2 = _mm256_unpacklo_epi32(r6, r7); - - r8 = _mm256_unpackhi_epi32(r0, r1); - r9 = _mm256_unpackhi_epi32(r2, r3); - r10 = _mm256_unpackhi_epi32(r4, r5); - r11 = _mm256_unpackhi_epi32(r6, r7); - - r0 = 
_mm256_unpacklo_epi64(temp, temp0); - r2 = _mm256_unpackhi_epi64(temp, temp0); - - r1 = _mm256_unpacklo_epi64(temp1, temp2); - r3 = _mm256_unpackhi_epi64(temp1, temp2); - - temp = _mm256_unpackhi_epi16(M[0], M[1]); - temp0 = _mm256_unpackhi_epi16(M[2], M[3]); - temp1 = _mm256_unpackhi_epi16(M[4], M[5]); - temp2 = _mm256_unpackhi_epi16(M[6], M[7]); - r4 = _mm256_unpackhi_epi16(M[8], M[9]); - - M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); - M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); - - - r5 = _mm256_unpackhi_epi16(M[10], M[11]); - r6 = _mm256_unpackhi_epi16(M[12], M[13]); - r7 = _mm256_unpackhi_epi16(M[14], M[15]); - - - - r0 = _mm256_unpacklo_epi64(r8, r9); - r1 = _mm256_unpacklo_epi64(r10, r11); - - r2 = _mm256_unpackhi_epi64(r8, r9); - r3 = _mm256_unpackhi_epi64(r10, r11); - - - - M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); - M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); - - - //for(i=0; i<4; i=i+1) - //{ - r0 = _mm256_unpacklo_epi32(temp, temp0); - r1 = _mm256_unpacklo_epi32(temp1, temp2); - r2 = _mm256_unpacklo_epi32(r4, r5); - r3 = _mm256_unpacklo_epi32(r6, r7); - - //} - - - //for(i=0; i<2; i=i+1) - //{ - r8 = _mm256_unpacklo_epi64(r0, r1); - r10 = _mm256_unpackhi_epi64(r0, r1); - - r9 = _mm256_unpacklo_epi64(r2, r3); - r11 = _mm256_unpackhi_epi64(r2, r3); - - M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); - M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); - M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); - M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); - - r0 = _mm256_unpackhi_epi32(temp, temp0); - r1 = _mm256_unpackhi_epi32(temp1, temp2); - r2 = _mm256_unpackhi_epi32(r4, r5); - r3 = _mm256_unpackhi_epi32(r6, r7); - - //} -// for(i=0; i<2; i=i+1) -// { - r4 = _mm256_unpacklo_epi64(r0, r1); - r6 = _mm256_unpackhi_epi64(r0, r1); - - r5 = 
_mm256_unpacklo_epi64(r2, r3); - r7 = _mm256_unpackhi_epi64(r2, r3); - -// } - - //------------------------------------------------------- - - M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); - M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); - M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); - M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); -} - -/* -void transpose_unrolled(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; - - __m256i r0, r1, r2, r3, r4, r5, r6, r7; - - //for(i=0; i<8; i=i+1) - //{ - tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); - tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); - - tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); - tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); - - tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); - tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); - - tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); - tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); - - tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); - tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); - - tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); - tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); - - tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); - tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); - - tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); - tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); - - //} - - //------------------------------------------------------- - //for(i=0; i<4; i=i+1) - //{ - bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); - bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); - - bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); - bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); - - bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); - bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); - - bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); - bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); - - //} - - //for(i=0; i<2; i=i+1) - //{ - dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); - dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); - - dL[1] = 
_mm256_unpacklo_epi64(bL[2], bL[3]); - dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); - - M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - //} - //for(i=0; i<2; i=i+1) - //{ - eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); - eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); - - eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); - eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); - - //} - - //------------------------------------------------------- - - //------------------------------------------------------- - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - //------------------------------------------------------- - - - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} - - -void transpose1(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], 
gL[2], gH[2]; - - for(i=0; i<8; i=i+1) - { - tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); - tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); - } - - for(i=0; i<4; i=i+1) - { - bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); - bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); - } - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); - dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); - eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} -*/ diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c 
deleted file mode 100644 index 48870f51..00000000 --- a/crypto_kem/firesaber/avx2/polymul/scm_avx.c +++ /dev/null @@ -1,753 +0,0 @@ -//#define SCM_SIZE 16 - -//#pragma STDC FP_CONTRACT ON - -#include - -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - - -static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are added cummulatively -{ - - register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - register __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first triangle - - //otherwise accumulate - c_avx[0] = mul_add(a0, b0, c_avx[0]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); - - - temp = _mm256_mullo_epi16 (a0, b2); - temp = mul_add(a1, b1, temp); - temp=mul_add(a2, b0, temp); - c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); - - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp=mul_add(a3, b0, temp); - c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp=mul_add(a2, b2, temp); - c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); - - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp=mul_add(a5, b0, temp); - c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, 
b3, temp); - temp=mul_add(a4, b2, temp); - c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); - - - temp = _mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - temp=mul_add(a5, b2, temp); - c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - - temp=mul_add(a6, b2, temp); - c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); - - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - temp=mul_add(a7, b2, temp); - c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); - - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - temp=mul_add(a[8], b2, temp); - c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); - - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - temp=mul_add(a[9], 
b2, temp); - c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); - - - temp = _mm256_mullo_epi16 (a0, b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - temp=mul_add(a[10], b2, temp); - c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); - - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - temp=mul_add(a[11], b2, temp); - c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); - - - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); - temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - temp=mul_add(a[12], b2, temp); - c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); - - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp 
); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - temp=mul_add(a[13], b2, temp); - c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - temp=mul_add(a1, b[1], temp); - c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); - - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - temp=mul_add(a1, b[2], temp); - c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); - - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); 
- temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - temp=mul_add(a1, b[3], temp); - c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); - - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - temp=mul_add(a1, b[4], temp); - c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); - - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - temp=mul_add(a1, b[5], temp); - c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); - - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - temp=mul_add(a1, b[6], temp); - c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); - - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - temp=mul_add(a1, b[7], temp); - c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); - - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add 
(a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - temp=mul_add(a1, b7, temp); - c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); - - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - temp=mul_add(a1, b6, temp); - c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); - - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - temp=mul_add(a1, b5, temp); - c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); - - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - temp=mul_add(a1, b4, temp); - c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); - - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - temp=mul_add(a1, b3, temp); - c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); - - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - temp=mul_add(a1, b2, temp); - c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); - - - c_avx[30] = mul_add(a1, b1, c_avx[30]); - - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - - -} - - - -static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are not added cummulatively -{ - - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - 
b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first triangle - c_avx[0] = _mm256_mullo_epi16 (a0, b0); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[1]=mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b2); - - temp = mul_add(a1, b1, temp); - c_avx[2]= mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c_avx[3]= mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c_avx[4]= mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c_avx[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c_avx[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - c_avx[7] = mul_add (a5, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - c_avx[8] = mul_add (a6, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - c_avx[9] = mul_add (a7, b2, 
temp); - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - c_avx[10] = mul_add (a[8], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - c_avx[11] = mul_add (a[9], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - c_avx[12] = mul_add (a[10], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - c_avx[13] = mul_add (a[11], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, 
b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); - temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - c_avx[14] = mul_add (a[12], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp ); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - c_avx[15] = mul_add (a[13], b2, temp ); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - c_avx[16] = mul_add (a1, b[1], temp ); - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add 
(a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - c_avx[17] = mul_add (a1, b[2], temp ); - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); - temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - c_avx[18] = mul_add (a1, b[3], temp ); - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - c_avx[19] = mul_add (a1, b[4], temp ); - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - c_avx[20] = mul_add (a1, b[5], temp ); - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - c_avx[21] = mul_add (a1, b[6], temp ); - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, 
b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - c_avx[22] = mul_add (a1, b[7], temp ); - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add (a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - c_avx[23] = mul_add (a1, b7, temp ); - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - c_avx[24] = mul_add (a1, b6, temp ); - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - c_avx[25] = mul_add (a1, b5, temp ); - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - c_avx[26] = mul_add (a1, b4, temp ); - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - c_avx[27] = mul_add (a1, b3, temp ); - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - c_avx[28] = mul_add (a1, b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[29] = mul_add (a1, b0, temp); - - c_avx[30] = _mm256_mullo_epi16 (a1, b1); - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - -} diff --git a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c deleted file mode 100644 index 78fb86c2..00000000 --- a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c +++ /dev/null @@ -1,1010 +0,0 @@ -/* -Cleaned version for step by step approach look into the _debug file -*/ -//#include "timing.c" -#include 
"consts.h" -#include "matrix.c" -#include "scm_avx.c" - -static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX. -{ - __m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time - - //uint16_t i; - - register __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - - //CLOCK1=cpucycles(); - - //------------------AVX evaluation for 1st poly----------------------- - - r0_avx=a[0]; - r1_avx=a[1]; - r2_avx=a[2]; - r3_avx=a[3]; - a_bucket[0]=r0_avx; - a_bucket[1]=r1_avx; - a_bucket[2]=r2_avx; - a_bucket[3]=r3_avx; - a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]); - - - //------------------AVX evaluation for 1st poly ends------------------ - - - //------------------AVX evaluation for 2nd poly----------------------- - r0_avx=a[small_len_avx]; - r1_avx=a[small_len_avx+1]; - r2_avx=a[small_len_avx+2]; - r3_avx=a[small_len_avx+3]; - a_bucket[0+9]=r0_avx; - a_bucket[1+9]=r1_avx; - a_bucket[2+9]=r2_avx; - a_bucket[3+9]=r3_avx; - a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]); - - - //------------------AVX evaluation for 2nd poly ends------------------ - - - //------------------AVX evaluation for 3rd poly----------------------- - r0_avx=a[2*small_len_avx]; - r1_avx=a[2*small_len_avx+1]; - r2_avx=a[2*small_len_avx+2]; - r3_avx=a[2*small_len_avx+3]; - a_bucket[0+18]=r0_avx; - a_bucket[1+18]=r1_avx; - a_bucket[2+18]=r2_avx; - a_bucket[3+18]=r3_avx; - a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+18]= 
_mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]); - - //------------------AVX evaluation for 3rd poly ends------------------ - - - //------------------AVX evaluation for 4th poly----------------------- - - r0_avx=a[3*small_len_avx]; - r1_avx=a[3*small_len_avx+1]; - r2_avx=a[3*small_len_avx+2]; - r3_avx=a[3*small_len_avx+3]; - a_bucket[0+27]=r0_avx; - a_bucket[1+27]=r1_avx; - a_bucket[2+27]=r2_avx; - a_bucket[3+27]=r3_avx; - a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]); - - //------------------AVX evaluation for 4th poly ends------------------ - - //------------------AVX evaluation for 5th poly----------------------- - - r0_avx=a[4*small_len_avx+0]; - r1_avx=a[4*small_len_avx+1]; - r2_avx=a[4*small_len_avx+2]; - r3_avx=a[4*small_len_avx+3]; - a_bucket[0+36]=r0_avx; - a_bucket[1+36]=r1_avx; - a_bucket[2+36]=r2_avx; - a_bucket[3+36]=r3_avx; - a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]); - - //------------------AVX evaluation for 5th poly ends------------------ - - - //------------------AVX evaluation for 6th poly----------------------- - r0_avx=a[5*small_len_avx]; - r1_avx=a[5*small_len_avx+1]; - r2_avx=a[5*small_len_avx+2]; - r3_avx=a[5*small_len_avx+3]; - a_bucket[0+45]=r0_avx; - a_bucket[1+45]=r1_avx; - a_bucket[2+45]=r2_avx; - a_bucket[3+45]=r3_avx; - a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - 
a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]); - - //------------------AVX evaluation for 6th poly ends------------------ - - //------------------AVX evaluation for 7th poly----------------------- - - r0_avx=a[6*small_len_avx]; - r1_avx=a[6*small_len_avx+1]; - r2_avx=a[6*small_len_avx+2]; - r3_avx=a[6*small_len_avx+3]; - a_bucket[0+54]=r0_avx; - a_bucket[1+54]=r1_avx; - a_bucket[2+54]=r2_avx; - a_bucket[3+54]=r3_avx; - a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]); - - //------------------AVX evaluation for 7th poly ends------------------ - - - - //CLOCK2=cpucycles(); - //CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1); - //printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1); - - - //CLOCK1=cpucycles(); - //-----------------Forward transposes-------------------------------------- - transpose_n1(a_bucket); - transpose_n1(a_bucket+16); - transpose_n1(a_bucket+32); - transpose_n1(a_bucket+48); - - //-----------------Forwatrd transposes ends--------------------------------- - - //----------------------all multiplications--------------------------------- - if(f==0){ - schoolbook_avx_new2(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - else{ - schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - //schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - 
schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - /* - schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f); - schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f); - */ - - - //----------------------all multiplications ends----------------------------- - - - //-----------------Reverse transposes-------------------------------------- - - /* - transpose(c_bucket); - transpose(c_bucket+16); - - transpose(c_bucket+2*SCM_SIZE); - transpose(c_bucket+16+2*SCM_SIZE); - - transpose(c_bucket+4*SCM_SIZE); - transpose(c_bucket+16+4*SCM_SIZE); - - transpose(c_bucket+6*SCM_SIZE); - transpose(c_bucket+16+6*SCM_SIZE); - */ - //-----------------Reverse transposes ends--------------------------------- - - //CLOCK2=cpucycles(); - //CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1); - - //KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6); - -} - -static void KARA_eval(__m256i* b, __m256i *b_bucket){ - - __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - //-------1st poly---------------------------------------------------- - r0_avx=b[0]; - r1_avx=b[1]; - r2_avx=b[2]; - r3_avx=b[3]; - b_bucket[0]=r0_avx; - b_bucket[1]=r1_avx; - b_bucket[2]=r2_avx; - b_bucket[3]=r3_avx; - b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]); - //-------2nd poly---------------------------------------------------- - - r0_avx=b[small_len_avx]; - r1_avx=b[small_len_avx+1]; - r2_avx=b[small_len_avx+2]; - r3_avx=b[small_len_avx+3]; - b_bucket[0+9]=r0_avx; - b_bucket[1+9]=r1_avx; - b_bucket[2+9]=r2_avx; - b_bucket[3+9]=r3_avx; - b_bucket[4+9]= 
_mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]); - - //-------3rd poly---------------------------------------------------- - - r0_avx=b[2*small_len_avx+0]; - r1_avx=b[2*small_len_avx+1]; - r2_avx=b[2*small_len_avx+2]; - r3_avx=b[2*small_len_avx+3]; - b_bucket[0+18]=r0_avx; - b_bucket[1+18]=r1_avx; - b_bucket[2+18]=r2_avx; - b_bucket[3+18]=r3_avx; - b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]); - - //-------4th poly---------------------------------------------------- - r0_avx=b[3*small_len_avx]; - r1_avx=b[3*small_len_avx+1]; - r2_avx=b[3*small_len_avx+2]; - r3_avx=b[3*small_len_avx+3]; - b_bucket[0+27]=r0_avx; - b_bucket[1+27]=r1_avx; - b_bucket[2+27]=r2_avx; - b_bucket[3+27]=r3_avx; - b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]); - - //-------5th poly---------------------------------------------------- - - r0_avx=b[4*small_len_avx]; - r1_avx=b[4*small_len_avx+1]; - r2_avx=b[4*small_len_avx+2]; - r3_avx=b[4*small_len_avx+3]; - b_bucket[0+36]=r0_avx; - b_bucket[1+36]=r1_avx; - b_bucket[2+36]=r2_avx; - b_bucket[3+36]=r3_avx; - b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]); - - //-------6th 
poly---------------------------------------------------- - - r0_avx=b[5*small_len_avx]; - r1_avx=b[5*small_len_avx+1]; - r2_avx=b[5*small_len_avx+2]; - r3_avx=b[5*small_len_avx+3]; - b_bucket[0+45]=r0_avx; - b_bucket[1+45]=r1_avx; - b_bucket[2+45]=r2_avx; - b_bucket[3+45]=r3_avx; - b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]); - - //-------7th poly---------------------------------------------------- - - r0_avx=b[6*small_len_avx]; - r1_avx=b[6*small_len_avx+1]; - r2_avx=b[6*small_len_avx+2]; - r3_avx=b[6*small_len_avx+3]; - b_bucket[0+54]=r0_avx; - b_bucket[1+54]=r1_avx; - b_bucket[2+54]=r2_avx; - b_bucket[3+54]=r3_avx; - b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]); - - //--------------Evaluating B poly ends------------------------------- - - transpose_n1(b_bucket); - transpose_n1(b_bucket+16); - transpose_n1(b_bucket+32); - transpose_n1(b_bucket+48); -} - -static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){ - - //int64_t i; - register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results - - __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; - - //CLOCK1=cpucycles(); - - //------------------------AVX interpolation for 1st poly external------------------- - - //loop1 - res_avx0 = c_bucket[0]; - res_avx2 = c_bucket[1]; - res_avx4 = c_bucket[2]; - res_avx6 = 
c_bucket[3]; - - c6_avx=c_bucket[6]; - c7_avx=c_bucket[7]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx); - - res_avx1 = c_bucket[16]; - res_avx3 = c_bucket[17]; - res_avx5 = c_bucket[18]; - res_avx7 = c_bucket[19]; - - c22_avx=c_bucket[22]; - c23_avx=c_bucket[23]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final0[0]=res_avx0; - result_final0[1]=res_avx1; - - result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final0[6]=res_avx6; - result_final0[7]=res_avx7; - - - //------------------------AVX interpolation for 1st poly ends-------------- - - - //------------------------AVX interpolation for 2nd poly external------------------- - - //loop1 - res_avx0 = c_bucket[9]; //c_bucket0 - res_avx2 = c_bucket[10]; //c_bucket1 - res_avx4 = c_bucket[11]; //c_bucket2 - res_avx6 = c_bucket[12]; 
//c_bucket3 - - c6_avx=c_bucket[15]; //c_bucket6 - c7_avx=c_bucket[32]; //c_bucket7 - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); - - res_avx1 = c_bucket[25]; //c_bucket0 - res_avx3 = c_bucket[26]; //c_bucket1 - res_avx5 = c_bucket[27]; //c_bucket2 - res_avx7 = c_bucket[28]; //c_bucket3 - - c22_avx=c_bucket[31]; - c23_avx=c_bucket[48]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final1[0]=res_avx0; - result_final1[1]=res_avx1; - - result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final1[6]=res_avx6; - result_final1[7]=res_avx7; - - - //------------------------AVX interpolation for 2nd poly ends-------------- - - //------------------------AVX interpolation for 3rd poly external------------------- - - //loop1 - res_avx0 = c_bucket[34]; //c_bucket0 - res_avx2 = c_bucket[35]; 
//c_bucket1 - res_avx4 = c_bucket[36]; - res_avx6 = c_bucket[37]; - - c6_avx=c_bucket[40]; - c7_avx=c_bucket[41]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); - - res_avx1 = c_bucket[50]; //c_bucket0 - res_avx3 = c_bucket[51]; //c_bucket1 - res_avx5 = c_bucket[52]; - res_avx7 = c_bucket[53]; - - c22_avx=c_bucket[56]; - c23_avx=c_bucket[57]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - //loop4 - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - //loop5 - result_final2[0]=res_avx0; - result_final2[1]=res_avx1; - - result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final2[6]=res_avx6; - result_final2[7]=res_avx7; - - //------------------------AVX interpolation for 3rd poly ends-------------- - - //------------------------AVX interpolation for 4th poly external------------------- - - //loop1 - res_avx0 = c_bucket[43]; - res_avx2 = c_bucket[44]; - res_avx4 = 
c_bucket[45]; - res_avx6 = c_bucket[46]; - - c6_avx=c_bucket[65]; - c7_avx=c_bucket[66]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); - - res_avx1 = c_bucket[59]; - res_avx3 = c_bucket[60]; - res_avx5 = c_bucket[61]; - res_avx7 = c_bucket[62]; - - c22_avx=c_bucket[81]; - c23_avx=c_bucket[82]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final3[0]=res_avx0; - result_final3[1]=res_avx1; - - result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final3[6]=res_avx6; - result_final3[7]=res_avx7; - - - //------------------------AVX interpolation for 4th poly ends-------------- - - //------------------------AVX interpolation for 5th poly external------------------- - - //loop1 - res_avx0 = c_bucket[68]; - res_avx2 = c_bucket[69]; - res_avx4 = c_bucket[70]; - res_avx6 = c_bucket[71]; - - 
c6_avx=c_bucket[74]; - c7_avx=c_bucket[75]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); - - res_avx1 = c_bucket[84]; - res_avx3 = c_bucket[85]; - res_avx5 = c_bucket[86]; - res_avx7 = c_bucket[87]; - - c22_avx=c_bucket[90]; - c23_avx=c_bucket[91]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final4[0]=res_avx0; - result_final4[1]=res_avx1; - - result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final4[6]=res_avx6; - result_final4[7]=res_avx7; - - - //------------------------AVX interpolation for 5th poly ends-------------- - - //------------------------AVX interpolation for 6th poly external------------------- - - //loop1 - res_avx0 = c_bucket[77]; - res_avx2 = c_bucket[78]; - res_avx4 = c_bucket[79]; - res_avx6 = c_bucket[96]; - - c6_avx=c_bucket[99]; - c7_avx=c_bucket[100]; - - 
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx); - - res_avx1 = c_bucket[93]; - res_avx3 = c_bucket[94]; - res_avx5 = c_bucket[95]; - res_avx7 = c_bucket[112]; - - c22_avx=c_bucket[115]; - c23_avx=c_bucket[116]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final5[0]=res_avx0; - result_final5[1]=res_avx1; - - result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final5[6]=res_avx6; - result_final5[7]=res_avx7; - - - //------------------------AVX interpolation for 6th poly ends-------------- - - //------------------------AVX interpolation for 7th poly external------------------- - - //loop1 - res_avx0 = c_bucket[102]; - res_avx2 = c_bucket[103]; - res_avx4 = c_bucket[104]; - res_avx6 = c_bucket[105]; - - c6_avx=c_bucket[108]; - c7_avx=c_bucket[109]; - - c8_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); - - res_avx1 = c_bucket[118]; - res_avx3 = c_bucket[119]; - res_avx5 = c_bucket[120]; - res_avx7 = c_bucket[121]; - - c22_avx=c_bucket[124]; - c23_avx=c_bucket[125]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final6[0]=res_avx0; - result_final6[1]=res_avx1; - - result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final6[6]=res_avx6; - result_final6[7]=res_avx7; - - - //------------------------AVX interpolation for 7th poly ends-------------- - - //CLOCK2=cpucycles(); - //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); - //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); - - - -} - -static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ - - int i; - -//---------------AVX data----------------------------- 
- - __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; - __m256i aw_avx[7*small_len_avx]; - -//----------------AVX data---------------------------- - - -// EVALUATION - - //CLOCK1=cpucycles(); - - for (i=0; i> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); - PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); - memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); + PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, res); // pack public key } -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { - uint16_t A[SABER_L][SABER_L][SABER_N]; - uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {{0}}; - uint16_t vp[SABER_N] = {0}; - uint16_t mp[SABER_N]; - uint16_t b[SABER_L][SABER_N]; + +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { size_t i, j; + + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + poly s[SABER_L]; + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + PQCLEAN_FIRESABER_CLEAN_GenSecret(s, noiseseed); PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); - PQCLEAN_FIRESABER_CLEAN_GenSecret(sp, seed_sp); - PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); + PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed - for (i = 0; i < SABER_L; i++) { 
+ + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits for (j = 0; j < SABER_N; j++) { - bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } + PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, res); - PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); - PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, pk); - PQCLEAN_FIRESABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - - PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(mp, m); - - for (j = 0; j < SABER_N; j++) { - vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); - } - - PQCLEAN_FIRESABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); -} - -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { - - uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N]; - uint16_t v[SABER_N] = {0}; - uint16_t cm[SABER_N]; - size_t i; - - PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); - PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext); - PQCLEAN_FIRESABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); - PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); + // vector-vector scalar multiplication with mod p + PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, pk); + PQCLEAN_FIRESABER_CLEAN_InnerProd(vprime, temp, s); + PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(message, m); for (i = 0; i < SABER_N; i++) { - v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; + } + + PQCLEAN_FIRESABER_CLEAN_POLT2BS(msk_c, vprime); +} + + +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const 
uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; + + poly temp[SABER_L]; + poly s[SABER_L]; + + const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; + + PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, ciphertext); + PQCLEAN_FIRESABER_CLEAN_InnerProd(&temp[0], temp, s); + + PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, packed_cm); + + for (i = 0; i < SABER_N; i++) { + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(m, v); diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.h b/crypto_kem/firesaber/clean/SABER_indcpa.h index 28a5feee..cc009afe 100644 --- a/crypto_kem/firesaber/clean/SABER_indcpa.h +++ b/crypto_kem/firesaber/clean/SABER_indcpa.h @@ -5,7 +5,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); +void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); diff --git a/crypto_kem/firesaber/clean/SABER_params.h b/crypto_kem/firesaber/clean/SABER_params.h index 9121a12b..6481efec 100644 --- a/crypto_kem/firesaber/clean/SABER_params.h +++ b/crypto_kem/firesaber/clean/SABER_params.h @@ -2,19 +2,21 @@ #define PARAMS_H -/* Change this for different security strengths */ - /* Don't change anything below 
this line */ #define SABER_L 4 #define SABER_MU 6 #define SABER_ET 6 -#define SABER_EQ 13 -#define SABER_EP 10 #define SABER_N 256 +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) + +#define SABER_EQ 13 +#define SABER_Q (1 << SABER_EQ) + #define SABER_SEEDBYTES 32 -#define SABER_NOISE_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 #define SABER_KEYBYTES 32 #define SABER_HASHBYTES 32 diff --git a/crypto_kem/firesaber/clean/api.h b/crypto_kem/firesaber/clean/api.h index 14718674..fdff18fa 100644 --- a/crypto_kem/firesaber/clean/api.h +++ b/crypto_kem/firesaber/clean/api.h @@ -15,4 +15,4 @@ int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); -#endif /* api_h */ +#endif /* PQCLEAN_FIRESABER_CLEAN_API_H */ diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c index 0add1409..ec2f1263 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.c +++ b/crypto_kem/firesaber/clean/pack_unpack.c @@ -1,136 +1,149 @@ -#include "api.h" +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" #include -void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); + out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 
0x0f) << 4); + out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); + in += 4; + out += 3; } } -void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2); - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); + out[0] = in[0] & 0x3f; + out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); + out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); + out[3] = ((in[2] & 0xff) >> 2); + in += 3; + out += 4; } } -static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 
4); - bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } -static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | 
((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; } } -static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] 
= ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; } } -static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); - data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); - data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + in += 5; + out += 4; } } -void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); } } -void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + BS2POLq(&data[i], bytes + i * 
SABER_POLYBYTES); } } -void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); } } -void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); } } -void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { +void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { size_t i, j; for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } } -void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { +void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { size_t i, j; memset(bytes, 0, SABER_KEYBYTES); for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } } diff --git a/crypto_kem/firesaber/clean/pack_unpack.h b/crypto_kem/firesaber/clean/pack_unpack.h index 0a8ee253..698cecb1 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.h +++ b/crypto_kem/firesaber/clean/pack_unpack.h @@ -1,27 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void 
PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); +void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); +void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); -void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); +void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c index c6d729ba..2ce0e871 100644 --- a/crypto_kem/firesaber/clean/poly.c +++ b/crypto_kem/firesaber/clean/poly.c @@ -3,32 +3,40 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include 
"poly_mul.h" #include -void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { +void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { size_t i, j; - for (i = 0; i < SABER_L; i++) { - for (j = 0; j < SABER_L; j++) { - if (transpose == 1) { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); - } else { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); + } + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); } } } } -void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - size_t j; - for (j = 0; j < SABER_L; j++) { - PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res, b[j], s[j]); +void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { + size_t i; + + PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); + for (i = 1; i < SABER_L; i++) { + PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); } } -void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYVECBYTES]; +void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); @@ -37,13 +45,13 @@ void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], co } } 
-void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; +void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; - shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); for (i = 0; i < SABER_L; i++) { - PQCLEAN_FIRESABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); + PQCLEAN_FIRESABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); } } diff --git a/crypto_kem/firesaber/clean/poly.h b/crypto_kem/firesaber/clean/poly.h index 044e4eec..fdbbfa1f 100644 --- a/crypto_kem/firesaber/clean/poly.h +++ b/crypto_kem/firesaber/clean/poly.h @@ -3,13 +3,21 @@ #include "SABER_params.h" #include -void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); +typedef union { + uint16_t coeffs[SABER_N]; +} poly; -void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); -void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); +void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); -void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); +void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); + +void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); + + +void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int 
accumulate); #endif diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c index 6b527c21..b57e04fb 100644 --- a/crypto_kem/firesaber/clean/poly_mul.c +++ b/crypto_kem/firesaber/clean/poly_mul.c @@ -1,4 +1,4 @@ -#include "poly_mul.h" +#include "poly.h" #include #include @@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t } /* res += a*b */ -void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { - uint16_t c[2 * SABER_N] = {0}; +void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { + uint16_t C[2 * SABER_N] = {0}; size_t i; - toom_cook_4way(c, a, b); + toom_cook_4way(C, a->coeffs, b->coeffs); /* reduction */ - for (i = SABER_N; i < 2 * SABER_N; i++) { - res[i - SABER_N] += (c[i - SABER_N] - c[i]); + if (accumulate == 0) { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); + } + } else { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); + } } } diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h index b6911577..b28b04f6 100644 --- a/crypto_kem/firesaber/clean/poly_mul.h +++ b/crypto_kem/firesaber/clean/poly_mul.h @@ -1,9 +1,3 @@ -#ifndef POLY_MUL_H -#define POLY_MUL_H -#include "SABER_params.h" -#include - -void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); -#endif + diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index 7e1dd2eb..027f1fab 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via 
https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile index 0522fe8d..f2817574 100644 --- a/crypto_kem/lightsaber/avx2/Makefile +++ b/crypto_kem/lightsaber/avx2/Makefile @@ -2,7 +2,7 @@ LIB=liblightsaber_avx2.a HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h -OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o +OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c index 47f760e9..50f57221 100644 --- a/crypto_kem/lightsaber/avx2/SABER_indcpa.c +++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c @@ -1,416 +1,125 @@ -#include "./polymul/toom-cook_4way.c" #include "SABER_indcpa.h" #include "SABER_params.h" -#include "api.h" -#include "cbd.h" #include "fips202.h" #include "pack_unpack.h" +#include "poly.h" #include "randombytes.h" #include -#include #include -//#include "randombytes.h" -//#include "./polymul/toom_cook_4/toom-cook_4way.c" -#define h1 4 //2^(EQ-EP-1) +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) -#define 
h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) +void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + poly *skpv1 = A[0]; // use first row of A to hold sk temporarily + toom4_points skpv1_eval[SABER_L]; + poly res[SABER_L]; -static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { - int32_t i, j; + uint8_t rand[SABER_NOISESEEDBYTES]; + uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); - } - } -} + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + randombytes(rand, SABER_NOISESEEDBYTES); + PQCLEAN_LIGHTSABER_AVX2_GenSecret(skpv1, rand); + PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key -static void GenMatrix(polyvec *a, const uint8_t *seed) { - uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_LIGHTSABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } - } - } -} - -static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { - - uint32_t i; - - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, 
SABER_NOISESEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - PQCLEAN_LIGHTSABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); - } -} - -//********************************matrix-vector mul routines***************************************************** -static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { - int64_t i, j; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - for (j = 0; j < NUM_POLY; j++) { - - if (isTranspose == 0) { - toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); - } else { - toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); - } - } - - TC_interpol(c_bucket, res_avx[i]); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); } -} + PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); // sample matrix A + PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order -static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { - - int64_t i; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); - } - TC_interpol(c_bucket, res_avx); -} - -//********************************matrix-vector mul routines***************************************************** - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { - - polyvec a[SABER_K]; - - uint16_t skpv1[SABER_K][SABER_N]; - - - - uint8_t seed[SABER_SEEDBYTES]; - uint8_t noiseseed[SABER_COINBYTES]; - int32_t i, j, k; - - -//--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod; - __m256i 
res_avx[SABER_K][SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - -//--------------AVX declaration ends------------------ - - randombytes(seed, SABER_SEEDBYTES); - - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state - randombytes(noiseseed, SABER_COINBYTES); - - - GenMatrix(a, seed); //sample matrix A - - GenSecret(skpv1, noiseseed); - - -// Load sk into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); - } - - } - - // Load a into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } + // rounding + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - - //------------------------do the matrix vector multiplication and rounding------------ - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); - } - matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order - - // Now truncation - - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - } - } - - //------------------Pack sk into byte string------- - - PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); - - //------------------Pack pk into byte string------- - - for (i = 0; i < SABER_K; i++) { // reuses 
skpv1[] for unpacking avx of public-key - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string - - - for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. - pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; - } - + PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(pk, res); // pack public key } void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + toom4_points skpv1_eval[SABER_L]; - uint32_t i, j, k; - polyvec a[SABER_K]; // skpv; - uint8_t seed[SABER_SEEDBYTES]; - uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t temp[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - - uint8_t msk_c[SABER_SCALEBYTES_KEM]; - - //--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod, mod_p; - __m256i res_avx[SABER_K][SABER_N / 16]; - __m256i vprime_avx[SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - __m256i pkcl_avx[SABER_K][SABER_N / 16]; - - __m256i message_avx[SABER_N / 16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - mod_p = _mm256_set1_epi16(SABER_P - 1); - - - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - - //--------------AVX declaration ends------------------ - for (i = 0; i < SABER_SEEDBYTES; i++) { 
// Load the seedbytes in the client seed from PK. - seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + PQCLEAN_LIGHTSABER_AVX2_GenSecret(temp, noiseseed); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); } - GenMatrix(a, seed); - GenSecret(skpv1, noiseseed); + PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); + PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } - } - } - //-----------------matrix-vector multiplication and rounding - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); - } - matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order - - // Now truncation - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - - } - } - - - //-----this result should be put in b_prime for later use in server. 
- for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - - PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string - -//**************client matrix-vector multiplication ends******************// - - //------now calculate the v' - - //-------unpack the public_key - PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); - } - } - - // InnerProduct - //for(k=0;k> i) & 0x01); - } - } - // message encoding - for (i = 0; i < SABER_N / 16; i++) { - message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); - message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); - } - - // SHIFTRIGHT(v'+h1-m mod p, EP-ET) - for (k = 0; k < SABER_N / 16; k++) { - vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); - vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); - vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); - } - - // Unpack avx - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); - } - - PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(msk_c, temp[0]); - - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + for (i = 0; i < SABER_N; i++) { + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; } + PQCLEAN_LIGHTSABER_AVX2_POLT2BS(msk_c, vprime); } void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; - 
uint32_t i, j; - uint16_t sksv[SABER_K][SABER_N]; //secret key of the server - uint16_t pksv[SABER_K][SABER_N]; - uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t op[SABER_N]; + poly temp[SABER_L]; + toom4_points sksv_eval[SABER_L]; - //--------------AVX declaration------------------ + const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; - - //__m256i mod_p; - - __m256i v_avx[SABER_N / 16]; - - //__m256i acc[2*SABER_N/16]; - - __m256i sksv_avx[SABER_K][SABER_N / 16]; - __m256i pksv_avx[SABER_K][SABER_N / 16]; - - //mod_p=_mm256_set1_epi16(SABER_P-1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - //--------------AVX declaration ends------------------ - - //-------unpack the public_key - - PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key - PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); - pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); - } + PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(temp, sk); + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); } - for (i = 0; i < SABER_N / 16; i++) { - v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); - } + PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(temp, ciphertext); + PQCLEAN_LIGHTSABER_AVX2_InnerProd(v, temp, sksv_eval); + PQCLEAN_LIGHTSABER_AVX2_BS2POLT(cm, packed_cm); - // InnerProduct(b', s, mod p) - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sksv_avx[j], b_bucket[j]); - } - - vector_vector_mul(v_avx, pksv_avx, b_bucket); - - for (i = 0; i < SABER_N / 16; i++) { - _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); - } - - - for (i = 0; i < 
SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; - } - - PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(op, scale_ar); - - - //addition of h2 for (i = 0; i < SABER_N; i++) { - message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } - - POL2MSG(m, message_dec_unpacked); + PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(m, v); } diff --git a/crypto_kem/lightsaber/avx2/SABER_params.h b/crypto_kem/lightsaber/avx2/SABER_params.h index 11d34fda..8da6ec34 100644 --- a/crypto_kem/lightsaber/avx2/SABER_params.h +++ b/crypto_kem/lightsaber/avx2/SABER_params.h @@ -1,46 +1,41 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" - - -#define SABER_K 2 +/* Don't change anything below this line */ +#define SABER_L 2 #define SABER_MU 10 #define SABER_ET 3 +#define SABER_N 256 + +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) #define SABER_EQ 13 -#define SABER_EP 10 +#define SABER_Q (1 << SABER_EQ) -#define SABER_N 256 -#define SABER_Q 8192 //2^13 -#define SABER_P 1024 +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_HASHBYTES 32 +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) - -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation - -#define 
SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) - -#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif diff --git a/crypto_kem/lightsaber/avx2/cbd.c b/crypto_kem/lightsaber/avx2/cbd.c index a43170e2..5a61236f 100644 --- a/crypto_kem/lightsaber/avx2/cbd.c +++ b/crypto_kem/lightsaber/avx2/cbd.c @@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,10 +20,7 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint64_t t, d, a[4], b[4]; int i, j; @@ -34,8 +31,8 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { d += (t >> j) & 0x0842108421UL; } - a[0] = d & 0x1f; - b[0] = (d >> 5) & 0x1f; + a[0] = d & 0x1f; + b[0] = (d >> 5) & 0x1f; a[1] = (d >> 10) & 0x1f; b[1] = (d >> 15) & 0x1f; a[2] = (d >> 20) & 0x1f; @@ -43,9 +40,9 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { a[3] = (d >> 30) & 0x1f; b[3] = (d >> 35); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]); + s[4 * i + 2] = (uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/lightsaber/avx2/cbd.h b/crypto_kem/lightsaber/avx2/cbd.h index 01ba76e8..5be3a405 100644 --- a/crypto_kem/lightsaber/avx2/cbd.h +++ b/crypto_kem/lightsaber/avx2/cbd.h @@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); #endif diff --git a/crypto_kem/lightsaber/avx2/kem.c b/crypto_kem/lightsaber/avx2/kem.c index 70221f10..e60a2d51 100644 --- a/crypto_kem/lightsaber/avx2/kem.c +++ b/crypto_kem/lightsaber/avx2/kem.c @@ -4,14 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" -#include +#include #include -#include -#include int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,7 +37,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); @@ -49,7 +47,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t } int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; @@ -65,7 +63,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const u sha3_512(kr, buf, 64); - PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); fail = 
PQCLEAN_LIGHTSABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h index b80c335d..b28b04f6 100644 --- a/crypto_kem/lightsaber/avx2/kem.h +++ b/crypto_kem/lightsaber/avx2/kem.h @@ -1,35 +1,3 @@ -#ifndef INDCPA_H -#define INDCPA_H - -#include - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); - - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); - -void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); - - -int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); - -int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); - -int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); -//uint64_t clock1,clock2; - -//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; - - -#endif diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c index e912fd0a..a9f866ae 100644 --- a/crypto_kem/lightsaber/avx2/pack_unpack.c +++ b/crypto_kem/lightsaber/avx2/pack_unpack.c @@ -1,502 +1,153 @@ +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" +#include - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - 
offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); + out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); + out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); + out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); + in += 8; + out += 3; } } -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); - data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); - data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); - data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); - } - -} - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0; - - for (j = 0; j 
< SABER_N / 2; j++) { - offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); + out[0] = (in[0]) & 0x07; + out[1] = ((in[0]) >> 3) & 0x07; + out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); + out[3] = ((in[1]) >> 1) & 0x07; + out[4] = ((in[1]) >> 4) & 0x07; + out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); + out[6] = ((in[2] >> 2) & 0x07); + out[7] = ((in[2] >> 5) & 0x07); + in += 3; + out += 8; } } -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0; - - for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - data[offset_data] = bytes[j] & 0x0f; - data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 8; j++) { + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; +static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; + for (j = 0; j < SABER_N / 8; j++) { + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) 
| ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; + } +} +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; } } - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; - 
data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + in += 5; + out += 4; } - } -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { +void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); + } +} - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); + } +} - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); +void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); + } +} - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); + } +} - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 
4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } } -void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { +void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } } - -void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = 
( (data[i][ offset_data + 1 ] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 
12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } -} - - - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } -} - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ 
offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - - - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } - - -} - - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ 
offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ offset_byte 
+ 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - -void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - //for(i=0;i> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - //} - - -} - - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - /*This function packs 11 bit data stream into 8 bits of data. 
- */ - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); - - } - } - -} - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - - data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); - - 
data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); - - data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); - - data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); - - data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); - - data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); - } - } - - -} - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); - } - } - - -} - - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, 
offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); - - data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); - } - } - - -} - -void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(bytes, data); - } -} - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(data, bytes); - } else if (modulus == 8192) { - PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(data, bytes); - } - -} diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.h b/crypto_kem/lightsaber/avx2/pack_unpack.h index 9a5d41f0..2ba7822b 100644 --- a/crypto_kem/lightsaber/avx2/pack_unpack.h +++ b/crypto_kem/lightsaber/avx2/pack_unpack.h @@ -1,56 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t 
data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t 
*data, const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/lightsaber/avx2/poly.c b/crypto_kem/lightsaber/avx2/poly.c new file mode 100644 index 00000000..56227f6f --- /dev/null +++ b/crypto_kem/lightsaber/avx2/poly.c @@ -0,0 +1,62 @@ +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "poly.h" + + +void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) { + size_t i, j; + toom4_points_product c_eval; + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[0][i], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[j][i], &s_eval[j], 1); + } + PQCLEAN_LIGHTSABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][0], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][j], &s_eval[j], 1); + } + PQCLEAN_LIGHTSABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } +} + +void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) { + size_t i; + toom4_points_product c_eval; //Holds results for 9 Karatsuba at a time + + 
PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[0], &s_eval[0], 0); + for (i = 1; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[i], &s_eval[i], 1); + } + + PQCLEAN_LIGHTSABER_AVX2_toom4_interp(c, &c_eval); +} + +void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_AVX2_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); + } +} diff --git a/crypto_kem/lightsaber/avx2/poly.h b/crypto_kem/lightsaber/avx2/poly.h index 8f2a7574..2e7b2a11 100644 --- a/crypto_kem/lightsaber/avx2/poly.h +++ b/crypto_kem/lightsaber/avx2/poly.h @@ -1,27 +1,38 @@ #ifndef POLY_H #define POLY_H -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ #include "SABER_params.h" +#include #include -typedef struct { +typedef union { uint16_t coeffs[SABER_N]; + __m256i dummy; } poly; -typedef struct { - poly vec[SABER_K]; -} polyvec; +typedef union { + uint16_t coeffs[4 * SABER_N]; + __m256i dummy; +} toom4_points; -void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); +typedef union { + uint16_t coeffs[8 * SABER_N]; + __m256i dummy; +} toom4_points_product; + +void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); + +void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); + +void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); -void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); +void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); + +void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); + +void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); #endif diff --git a/crypto_kem/lightsaber/avx2/poly_mul.c b/crypto_kem/lightsaber/avx2/poly_mul.c new file mode 100644 index 00000000..9ae8de05 --- /dev/null +++ b/crypto_kem/lightsaber/avx2/poly_mul.c @@ -0,0 +1,1524 @@ +#include "SABER_params.h" +#include "poly.h" + + +#define L (SABER_N / 64) + +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { + return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); +} + 
+static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = mul_add(a0, b0, c[0]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[1] = _mm256_add_epi16(temp, c[1]); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + temp = mul_add(a2, b0, temp); + c[2] = _mm256_add_epi16(temp, c[2]); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + temp = mul_add(a3, b0, temp); + c[3] = _mm256_add_epi16(temp, c[3]); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + temp = mul_add(a2, b2, temp); + c[4] = _mm256_add_epi16(temp, c[4]); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + temp = mul_add(a5, b0, temp); + c[5] = _mm256_add_epi16(temp, c[5]); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a4, b2, temp); + c[6] = _mm256_add_epi16(temp, c[6]); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a5, b2, temp); + c[7] = _mm256_add_epi16(temp, c[7]); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); 
+ temp = mul_add(a2, b6, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a6, b2, temp); + c[8] = _mm256_add_epi16(temp, c[8]); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a7, b2, temp); + c[9] = _mm256_add_epi16(temp, c[9]); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a[8], b2, temp); + c[10] = _mm256_add_epi16(temp, c[10]); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + temp = mul_add(a[9], b2, temp); + c[11] = _mm256_add_epi16(temp, c[11]); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + temp = mul_add(a[10], b2, temp); + c[12] = _mm256_add_epi16(temp, c[12]); + + temp = _mm256_mullo_epi16(a0, b[13]); + temp = mul_add(a1, b[12], 
temp); + temp = mul_add(a[12], b1, temp); + temp = mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + temp = mul_add(a[11], b2, temp); + c[13] = _mm256_add_epi16(temp, c[13]); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + temp = mul_add(a[12], b2, temp); + c[14] = _mm256_add_epi16(temp, c[14]); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + temp = mul_add(a[13], b2, temp); + c[15] = _mm256_add_epi16(temp, c[15]); + + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + 
temp = mul_add(a[5], b4, temp); + temp = mul_add(a[6], b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + temp = mul_add(a1, b[1], temp); + c[16] = _mm256_add_epi16(temp, c[16]); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + temp = mul_add(a1, b[2], temp); + c[17] = _mm256_add_epi16(temp, c[17]); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + temp = mul_add(a1, b[3], temp); + c[18] = _mm256_add_epi16(temp, c[18]); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + temp = mul_add(a1, b[4], temp); + c[19] = _mm256_add_epi16(temp, c[19]); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, 
temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); + temp = mul_add(a0, b[6], temp); + temp = mul_add(a1, b[5], temp); + c[20] = _mm256_add_epi16(temp, c[20]); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + temp = mul_add(a1, b[6], temp); + c[21] = _mm256_add_epi16(temp, c[21]); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + temp = mul_add(a1, b[7], temp); + c[22] = _mm256_add_epi16(temp, c[22]); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + temp = mul_add(a1, b7, temp); + c[23] = _mm256_add_epi16(temp, c[23]); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + temp = mul_add(a1, b6, temp); + c[24] = _mm256_add_epi16(temp, c[24]); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + temp = mul_add(a1, b5, temp); + c[25] = _mm256_add_epi16(temp, c[25]); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + temp = mul_add(a1, b4, temp); + c[26] = 
_mm256_add_epi16(temp, c[26]); + + temp = _mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + temp = mul_add(a1, b3, temp); + c[27] = _mm256_add_epi16(temp, c[27]); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + temp = mul_add(a1, b2, temp); + c[28] = _mm256_add_epi16(temp, c[28]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[29] = _mm256_add_epi16(temp, c[29]); + + c[30] = mul_add(a1, b1, c[30]); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + + +static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = _mm256_mullo_epi16(a0, b0); + + temp = _mm256_mullo_epi16(a0, b1); + c[1] = mul_add(a1, b0, temp); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + c[2] = mul_add(a2, b0, temp); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + c[3] = mul_add(a3, b0, temp); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + c[4] = mul_add(a2, b2, temp); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + c[5] = mul_add(a5, b0, temp); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + c[6] = mul_add(a4, b2, temp); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, 
b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + c[7] = mul_add(a5, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + c[8] = mul_add(a6, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + c[9] = mul_add(a7, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + c[10] = mul_add(a[8], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + c[11] = mul_add(a[9], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + c[12] = mul_add(a[10], 
b2, temp); + + temp = _mm256_mullo_epi16(a0, b[13]); + temp = mul_add(a1, b[12], temp); + temp = mul_add(a[12], b1, temp); + temp = mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + c[13] = mul_add(a[11], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + c[14] = mul_add(a[12], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + c[15] = mul_add(a[13], b2, temp); + + // unrolled second triangle + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + temp = 
mul_add(a[5], b4, temp); + temp = mul_add(a[6], b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + c[16] = mul_add(a1, b[1], temp); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + c[17] = mul_add(a1, b[2], temp); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + c[18] = mul_add(a1, b[3], temp); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + c[19] = mul_add(a1, b[4], temp); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); 
+ temp = mul_add(a0, b[6], temp); + c[20] = mul_add(a1, b[5], temp); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + c[21] = mul_add(a1, b[6], temp); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + c[22] = mul_add(a1, b[7], temp); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + c[23] = mul_add(a1, b7, temp); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + c[24] = mul_add(a1, b6, temp); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + c[25] = mul_add(a1, b5, temp); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + c[26] = mul_add(a1, b4, temp); + + temp = _mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + c[27] = mul_add(a1, b3, temp); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + c[28] = mul_add(a1, b2, temp); + + temp = _mm256_mullo_epi16(a0, b1); + c[29] = mul_add(a1, b0, temp); + + c[30] = _mm256_mullo_epi16(a1, b1); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + +static void transpose(__m256i 
*M) { + __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; + __m256i temp, temp0, temp1, temp2; + + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = _mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + r0 = _mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = 
_mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) { + toom4_points a_eval;// Holds evaluation (a & b) for 7 Karatsuba at a time + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + __m256i *va = (__m256i *)a_eval.coeffs; + __m256i *vb = (__m256i *)b_eval->coeffs; + __m256i *vc = (__m256i *)c_eval->coeffs; + + //------------------AVX evaluation for 1st poly----------------------- + r0_avx = a[0 * L + 0]; + r1_avx = a[0 * L + 1]; + r2_avx = a[0 * L + 2]; + r3_avx = a[0 * L + 3]; + + va[0] = r0_avx; + va[1] = r1_avx; + va[2] = r2_avx; + va[3] = r3_avx; + va[4] = _mm256_add_epi16(r0_avx, r1_avx); + va[5] = _mm256_add_epi16(r2_avx, r3_avx); + va[6] = _mm256_add_epi16(r0_avx, r2_avx); + va[7] = _mm256_add_epi16(r1_avx, r3_avx); + va[8] = _mm256_add_epi16(va[6], va[7]); + //------------------AVX evaluation for 1st poly ends------------------ + + //------------------AVX evaluation for 2nd poly----------------------- + r0_avx = a[1 * L + 0]; + r1_avx = a[1 * L + 1]; + r2_avx = a[1 * L + 
2]; + r3_avx = a[1 * L + 3]; + + va[0 + 9] = r0_avx; + va[1 + 9] = r1_avx; + va[2 + 9] = r2_avx; + va[3 + 9] = r3_avx; + va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]); + //------------------AVX evaluation for 2nd poly ends------------------ + + //------------------AVX evaluation for 3rd poly----------------------- + r0_avx = a[2 * L + 0]; + r1_avx = a[2 * L + 1]; + r2_avx = a[2 * L + 2]; + r3_avx = a[2 * L + 3]; + + va[0 + 18] = r0_avx; + va[1 + 18] = r1_avx; + va[2 + 18] = r2_avx; + va[3 + 18] = r3_avx; + va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]); + //------------------AVX evaluation for 3rd poly ends------------------ + + //------------------AVX evaluation for 4th poly----------------------- + r0_avx = a[3 * L + 0]; + r1_avx = a[3 * L + 1]; + r2_avx = a[3 * L + 2]; + r3_avx = a[3 * L + 3]; + + va[0 + 27] = r0_avx; + va[1 + 27] = r1_avx; + va[2 + 27] = r2_avx; + va[3 + 27] = r3_avx; + va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]); + //------------------AVX evaluation for 4th poly ends------------------ + + //------------------AVX evaluation for 5th poly----------------------- + r0_avx = a[4 * L + 0]; + r1_avx = a[4 * L + 1]; + r2_avx = a[4 * L + 2]; + r3_avx = a[4 * L + 3]; + + va[0 + 36] = r0_avx; + va[1 + 36] = r1_avx; + va[2 + 36] = r2_avx; + va[3 + 36] = r3_avx; + va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + 
va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]); + //------------------AVX evaluation for 5th poly ends------------------ + + //------------------AVX evaluation for 6th poly----------------------- + r0_avx = a[5 * L + 0]; + r1_avx = a[5 * L + 1]; + r2_avx = a[5 * L + 2]; + r3_avx = a[5 * L + 3]; + + va[0 + 45] = r0_avx; + va[1 + 45] = r1_avx; + va[2 + 45] = r2_avx; + va[3 + 45] = r3_avx; + va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]); + //------------------AVX evaluation for 6th poly ends------------------ + + //------------------AVX evaluation for 7th poly----------------------- + r0_avx = a[6 * L + 0]; + r1_avx = a[6 * L + 1]; + r2_avx = a[6 * L + 2]; + r3_avx = a[6 * L + 3]; + + va[0 + 54] = r0_avx; + va[1 + 54] = r1_avx; + va[2 + 54] = r2_avx; + va[3 + 54] = r3_avx; + va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]); + //------------------AVX evaluation for 7th poly ends------------------ + + //-----------------Forward transposes-------------------------------------- + transpose(va); + transpose(va + 16); + transpose(va + 32); + transpose(va + 48); + //-----------------Forward transposes ends--------------------------------- + + if (accumulate == 0) { + schoolbook_avx(vc, va, vb); + schoolbook_avx(vc + 32, va + 16, vb + 16); + schoolbook_avx(vc + 64, va + 32, vb + 32); + schoolbook_avx(vc + 96, va + 48, vb + 48); + } else { + schoolbook_avx_acc(vc, va, vb); + schoolbook_avx_acc(vc + 32, va + 16, vb + 16); + schoolbook_avx_acc(vc + 64, va + 32, vb + 32); + 
schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + } +} + +static void karatsuba_eval(__m256i *b_eval, const __m256i *b) { + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + //-------1st poly---------------------------------------------------- + r0_avx = b[0 * L + 0]; + r1_avx = b[0 * L + 1]; + r2_avx = b[0 * L + 2]; + r3_avx = b[0 * L + 3]; + + b_eval[0] = r0_avx; + b_eval[1] = r1_avx; + b_eval[2] = r2_avx; + b_eval[3] = r3_avx; + b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]); + + //-------2nd poly---------------------------------------------------- + r0_avx = b[1 * L + 0]; + r1_avx = b[1 * L + 1]; + r2_avx = b[1 * L + 2]; + r3_avx = b[1 * L + 3]; + + b_eval[0 + 9] = r0_avx; + b_eval[1 + 9] = r1_avx; + b_eval[2 + 9] = r2_avx; + b_eval[3 + 9] = r3_avx; + b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]); + + //-------3rd poly---------------------------------------------------- + r0_avx = b[2 * L + 0]; + r1_avx = b[2 * L + 1]; + r2_avx = b[2 * L + 2]; + r3_avx = b[2 * L + 3]; + + b_eval[0 + 18] = r0_avx; + b_eval[1 + 18] = r1_avx; + b_eval[2 + 18] = r2_avx; + b_eval[3 + 18] = r3_avx; + b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]); + + //-------4th poly---------------------------------------------------- + r0_avx = b[3 * L + 0]; + r1_avx = b[3 * L + 1]; + r2_avx = b[3 * L + 2]; + r3_avx = b[3 * L + 3]; + + b_eval[0 + 27] = r0_avx; + 
b_eval[1 + 27] = r1_avx; + b_eval[2 + 27] = r2_avx; + b_eval[3 + 27] = r3_avx; + b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]); + + //-------5th poly---------------------------------------------------- + r0_avx = b[4 * L + 0]; + r1_avx = b[4 * L + 1]; + r2_avx = b[4 * L + 2]; + r3_avx = b[4 * L + 3]; + + b_eval[0 + 36] = r0_avx; + b_eval[1 + 36] = r1_avx; + b_eval[2 + 36] = r2_avx; + b_eval[3 + 36] = r3_avx; + b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]); + + //-------6th poly---------------------------------------------------- + r0_avx = b[5 * L + 0]; + r1_avx = b[5 * L + 1]; + r2_avx = b[5 * L + 2]; + r3_avx = b[5 * L + 3]; + + b_eval[0 + 45] = r0_avx; + b_eval[1 + 45] = r1_avx; + b_eval[2 + 45] = r2_avx; + b_eval[3 + 45] = r3_avx; + b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]); + + //-------7th poly---------------------------------------------------- + r0_avx = b[6 * L + 0]; + r1_avx = b[6 * L + 1]; + r2_avx = b[6 * L + 2]; + r3_avx = b[6 * L + 3]; + + b_eval[0 + 54] = r0_avx; + b_eval[1 + 54] = r1_avx; + b_eval[2 + 54] = r2_avx; + b_eval[3 + 54] = r3_avx; + b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 
54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]); + + //--------------Evaluating B poly ends------------------------------- + transpose(b_eval); + transpose(b_eval + 16); + transpose(b_eval + 32); + transpose(b_eval + 48); +} + +static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) { + __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results + __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; + + //------------------------AVX interpolation for 1st poly external------------------- + res_avx0 = c_eval[0]; + res_avx2 = c_eval[1]; + res_avx4 = c_eval[2]; + res_avx6 = c_eval[3]; + c6_avx = c_eval[6]; + c7_avx = c_eval[7]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx); + + res_avx1 = c_eval[16]; + res_avx3 = c_eval[17]; + res_avx5 = c_eval[18]; + res_avx7 = c_eval[19]; + c22_avx = c_eval[22]; + c23_avx = c_eval[23]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final0[0] = res_avx0; + result_final0[1] = res_avx1; + result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final0[6] = res_avx6; + result_final0[7] = res_avx7; + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + res_avx0 = c_eval[9]; //c_eval0 + res_avx2 = c_eval[10]; //c_eval1 + res_avx4 = c_eval[11]; //c_eval2 + res_avx6 = c_eval[12]; //c_eval3 + c6_avx = c_eval[15]; //c_eval6 + c7_avx = c_eval[32]; //c_eval7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx); + + res_avx1 = c_eval[25]; //c_eval0 + res_avx3 = c_eval[26]; //c_eval1 + res_avx5 = c_eval[27]; //c_eval2 + res_avx7 = c_eval[28]; //c_eval3 + c22_avx = c_eval[31]; + c23_avx = c_eval[48]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final1[0] = res_avx0; + result_final1[1] = res_avx1; + result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final1[6] = res_avx6; + result_final1[7] = res_avx7; + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + res_avx0 = c_eval[34]; //c_eval0 + res_avx2 = c_eval[35]; //c_eval1 + res_avx4 = c_eval[36]; + res_avx6 = c_eval[37]; + c6_avx = c_eval[40]; + c7_avx = c_eval[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx); + + res_avx1 = c_eval[50]; //c_eval0 + res_avx3 = c_eval[51]; //c_eval1 + res_avx5 = c_eval[52]; + res_avx7 = c_eval[53]; + c22_avx = c_eval[56]; + c23_avx = c_eval[57]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final2[0] = res_avx0; + result_final2[1] = res_avx1; + result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final2[6] = res_avx6; + result_final2[7] = res_avx7; + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + res_avx0 = c_eval[43]; + res_avx2 = c_eval[44]; + res_avx4 = c_eval[45]; + res_avx6 = c_eval[46]; + c6_avx = c_eval[65]; + c7_avx = c_eval[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx); + + res_avx1 = c_eval[59]; + res_avx3 = c_eval[60]; + res_avx5 = c_eval[61]; + res_avx7 = c_eval[62]; + c22_avx = c_eval[81]; + c23_avx = c_eval[82]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final3[0] = res_avx0; + result_final3[1] = 
res_avx1; + result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final3[6] = res_avx6; + result_final3[7] = res_avx7; + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + res_avx0 = c_eval[68]; + res_avx2 = c_eval[69]; + res_avx4 = c_eval[70]; + res_avx6 = c_eval[71]; + c6_avx = c_eval[74]; + c7_avx = c_eval[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx); + + res_avx1 = c_eval[84]; + res_avx3 = c_eval[85]; + res_avx5 = c_eval[86]; + res_avx7 = c_eval[87]; + c22_avx = c_eval[90]; + c23_avx = c_eval[91]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final4[0] = res_avx0; + result_final4[1] = res_avx1; + result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final4[3] = _mm256_add_epi16(res_avx3, 
c22_avx); + result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final4[6] = res_avx6; + result_final4[7] = res_avx7; + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + res_avx0 = c_eval[77]; + res_avx2 = c_eval[78]; + res_avx4 = c_eval[79]; + res_avx6 = c_eval[96]; + c6_avx = c_eval[99]; + c7_avx = c_eval[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx); + + res_avx1 = c_eval[93]; + res_avx3 = c_eval[94]; + res_avx5 = c_eval[95]; + res_avx7 = c_eval[112]; + c22_avx = c_eval[115]; + c23_avx = c_eval[116]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final5[0] = res_avx0; + result_final5[1] = res_avx1; + result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final5[5] = 
_mm256_add_epi16(res_avx5, c23_avx); + result_final5[6] = res_avx6; + result_final5[7] = res_avx7; + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + res_avx0 = c_eval[102]; + res_avx2 = c_eval[103]; + res_avx4 = c_eval[104]; + res_avx6 = c_eval[105]; + c6_avx = c_eval[108]; + c7_avx = c_eval[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx); + + res_avx1 = c_eval[118]; + res_avx3 = c_eval[119]; + res_avx5 = c_eval[120]; + res_avx7 = c_eval[121]; + c22_avx = c_eval[124]; + c23_avx = c_eval[125]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final6[0] = res_avx0; + result_final6[1] = res_avx1; + result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final6[6] = res_avx6; + result_final6[7] = res_avx7; + 
//------------------------AVX interpolation for 7th poly ends-------------- +} + +void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7 * L]; + __m256i *va = (__m256i *)a->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = va[0 * L + i]; + r1_avx = va[1 * L + i]; + r2_avx = va[2 * L + i]; + r3_avx = va[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + aw_avx[6 * L + i] = r0_avx; + aw_avx[0 * L + i] = r3_avx; + } + + batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate); +} + +void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) { + size_t i; + __m256i bw_avx[7 * L]; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i *vb = (__m256i *)b->coeffs; + __m256i *vb_eval = (__m256i *)b_eval->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = vb[0 * L + i]; + r1_avx = vb[1 * L + i]; + r2_avx = vb[2 * L + i]; + r3_avx = vb[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + 
bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + bw_avx[6 * L + i] = r0_avx; + bw_avx[0 * L + i] = r3_avx; + } + + karatsuba_eval(vb_eval, bw_avx); +} + + +void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx; + __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; + __m256i res_full[32]; + __m256i *vc = (__m256i *)c_eval->coeffs; + __m256i *vres = (__m256i *)res->coeffs; + + transpose(vc); + transpose(vc + 16); + transpose(vc + 32); + transpose(vc + 48); + transpose(vc + 64); + transpose(vc + 80); + transpose(vc + 96); + transpose(vc + 112); + + karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc); + + for (i = 0; i < 2 * L; i++) { + r0_avx = w1_avx[i]; + r1_avx = w2_avx[i]; + r2_avx = w3_avx[i]; + r3_avx = w4_avx[i]; + r4_avx = w5_avx[i]; + r5_avx = w6_avx[i]; + r6_avx = w7_avx[i]; + + r1_avx = _mm256_add_epi16(r1_avx, r4_avx); + r5_avx = _mm256_sub_epi16(r5_avx, r4_avx); + r3_avx = _mm256_sub_epi16(r3_avx, r2_avx); + r3_avx = _mm256_srli_epi16(r3_avx, 1); + r4_avx = _mm256_sub_epi16(r4_avx, r0_avx); + temp_avx = _mm256_slli_epi16(r6_avx, 6); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, 
r5_avx); + r2_avx = _mm256_add_epi16(r2_avx, r3_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 6); + + r1_avx = _mm256_sub_epi16(r1_avx, temp_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r2_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r6_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r0_avx); + temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45)); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 3); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16) + r4_avx = _mm256_srli_epi16(r4_avx, 3); + r5_avx = _mm256_add_epi16(r5_avx, r1_avx); + temp_avx = _mm256_slli_epi16(r3_avx, 4); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16) + r1_avx = _mm256_srli_epi16(r1_avx, 1); + r3_avx = _mm256_add_epi16(r1_avx, r3_avx); + r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); + temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30)); + temp_avx = _mm256_sub_epi16(temp_avx, r5_avx); + temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16) + + r5_avx = _mm256_srli_epi16(temp_avx, 2); + r2_avx = _mm256_sub_epi16(r2_avx, r4_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r5_avx); + + if (i < L) { + res_full[0 * L + i] = r6_avx; + res_full[1 * L + i] = r5_avx; + res_full[2 * L + i] = r4_avx; + res_full[3 * L + i] = r3_avx; + res_full[4 * L + i] = r2_avx; + res_full[5 * L + i] = r1_avx; + res_full[6 * L + i] = r0_avx; + } else { + res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx); + res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx); + res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx); + res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx); + res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx); + res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * 
L + i], r1_avx); + res_full[6 * L + i] = r0_avx; + } + } + + // Reduction by X^256 + 1 + for (i = 0; i < 16; i++) { + vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]); + } +} diff --git a/crypto_kem/lightsaber/avx2/polymul/consts.h b/crypto_kem/lightsaber/avx2/polymul/consts.h deleted file mode 100644 index 40826398..00000000 --- a/crypto_kem/lightsaber/avx2/polymul/consts.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "../SABER_params.h" - -#define AVX_N (SABER_N >> 4) -#define small_len_avx (AVX_N >> 2) - -#define SCHB_N 16 - -#define N_SB (SABER_N >> 2) -#define N_SB_RES (2*N_SB-1) - -#define N_SB_16 (N_SB >> 2) -#define N_SB_16_RES (2*N_SB_16-1) - -#define AVX_N1 16 /*N/16*/ - -#define SCM_SIZE 16 - -// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements -#define NUM_POLY SABER_K -//int NUM_POLY=2; diff --git a/crypto_kem/lightsaber/avx2/polymul/matrix.c b/crypto_kem/lightsaber/avx2/polymul/matrix.c deleted file mode 100644 index 5fa35783..00000000 --- a/crypto_kem/lightsaber/avx2/polymul/matrix.c +++ /dev/null @@ -1,303 +0,0 @@ -#include - -static void transpose_n1(__m256i *M) -{ - //int i; - register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; - register __m256i temp, temp0, temp1, temp2; - - //for(i=0; i<8; i=i+1) - //{ - r0 = _mm256_unpacklo_epi16(M[0], M[1]); - r1 = _mm256_unpacklo_epi16(M[2], M[3]); - r2 = _mm256_unpacklo_epi16(M[4], M[5]); - r3 = _mm256_unpacklo_epi16(M[6], M[7]); - r4 = _mm256_unpacklo_epi16(M[8], M[9]); - r5 = _mm256_unpacklo_epi16(M[10], M[11]); - r6 = _mm256_unpacklo_epi16(M[12], M[13]); - r7 = _mm256_unpacklo_epi16(M[14], M[15]); - - - temp = _mm256_unpacklo_epi32(r0, r1); - temp0 = _mm256_unpacklo_epi32(r2, r3); - temp1 = _mm256_unpacklo_epi32(r4, r5); - temp2 = _mm256_unpacklo_epi32(r6, r7); - - r8 = _mm256_unpackhi_epi32(r0, r1); - r9 = _mm256_unpackhi_epi32(r2, r3); - r10 = _mm256_unpackhi_epi32(r4, r5); - r11 = _mm256_unpackhi_epi32(r6, r7); - - r0 = 
_mm256_unpacklo_epi64(temp, temp0); - r2 = _mm256_unpackhi_epi64(temp, temp0); - - r1 = _mm256_unpacklo_epi64(temp1, temp2); - r3 = _mm256_unpackhi_epi64(temp1, temp2); - - temp = _mm256_unpackhi_epi16(M[0], M[1]); - temp0 = _mm256_unpackhi_epi16(M[2], M[3]); - temp1 = _mm256_unpackhi_epi16(M[4], M[5]); - temp2 = _mm256_unpackhi_epi16(M[6], M[7]); - r4 = _mm256_unpackhi_epi16(M[8], M[9]); - - M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); - M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); - - - r5 = _mm256_unpackhi_epi16(M[10], M[11]); - r6 = _mm256_unpackhi_epi16(M[12], M[13]); - r7 = _mm256_unpackhi_epi16(M[14], M[15]); - - - - r0 = _mm256_unpacklo_epi64(r8, r9); - r1 = _mm256_unpacklo_epi64(r10, r11); - - r2 = _mm256_unpackhi_epi64(r8, r9); - r3 = _mm256_unpackhi_epi64(r10, r11); - - - - M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); - M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); - - - //for(i=0; i<4; i=i+1) - //{ - r0 = _mm256_unpacklo_epi32(temp, temp0); - r1 = _mm256_unpacklo_epi32(temp1, temp2); - r2 = _mm256_unpacklo_epi32(r4, r5); - r3 = _mm256_unpacklo_epi32(r6, r7); - - //} - - - //for(i=0; i<2; i=i+1) - //{ - r8 = _mm256_unpacklo_epi64(r0, r1); - r10 = _mm256_unpackhi_epi64(r0, r1); - - r9 = _mm256_unpacklo_epi64(r2, r3); - r11 = _mm256_unpackhi_epi64(r2, r3); - - M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); - M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); - M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); - M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); - - r0 = _mm256_unpackhi_epi32(temp, temp0); - r1 = _mm256_unpackhi_epi32(temp1, temp2); - r2 = _mm256_unpackhi_epi32(r4, r5); - r3 = _mm256_unpackhi_epi32(r6, r7); - - //} -// for(i=0; i<2; i=i+1) -// { - r4 = _mm256_unpacklo_epi64(r0, r1); - r6 = _mm256_unpackhi_epi64(r0, r1); - - r5 = 
_mm256_unpacklo_epi64(r2, r3); - r7 = _mm256_unpackhi_epi64(r2, r3); - -// } - - //------------------------------------------------------- - - M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); - M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); - M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); - M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); -} - -/* -void transpose_unrolled(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; - - __m256i r0, r1, r2, r3, r4, r5, r6, r7; - - //for(i=0; i<8; i=i+1) - //{ - tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); - tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); - - tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); - tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); - - tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); - tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); - - tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); - tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); - - tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); - tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); - - tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); - tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); - - tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); - tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); - - tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); - tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); - - //} - - //------------------------------------------------------- - //for(i=0; i<4; i=i+1) - //{ - bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); - bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); - - bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); - bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); - - bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); - bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); - - bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); - bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); - - //} - - //for(i=0; i<2; i=i+1) - //{ - dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); - dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); - - dL[1] = 
_mm256_unpacklo_epi64(bL[2], bL[3]); - dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); - - M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - //} - //for(i=0; i<2; i=i+1) - //{ - eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); - eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); - - eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); - eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); - - //} - - //------------------------------------------------------- - - //------------------------------------------------------- - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - //------------------------------------------------------- - - - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} - - -void transpose1(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], 
gL[2], gH[2]; - - for(i=0; i<8; i=i+1) - { - tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); - tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); - } - - for(i=0; i<4; i=i+1) - { - bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); - bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); - } - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); - dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); - eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} -*/ diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c 
deleted file mode 100644 index 48870f51..00000000 --- a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c +++ /dev/null @@ -1,753 +0,0 @@ -//#define SCM_SIZE 16 - -//#pragma STDC FP_CONTRACT ON - -#include - -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - - -static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are added cummulatively -{ - - register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - register __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first triangle - - //otherwise accumulate - c_avx[0] = mul_add(a0, b0, c_avx[0]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); - - - temp = _mm256_mullo_epi16 (a0, b2); - temp = mul_add(a1, b1, temp); - temp=mul_add(a2, b0, temp); - c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); - - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp=mul_add(a3, b0, temp); - c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp=mul_add(a2, b2, temp); - c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); - - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp=mul_add(a5, b0, temp); - c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, 
b3, temp); - temp=mul_add(a4, b2, temp); - c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); - - - temp = _mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - temp=mul_add(a5, b2, temp); - c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - - temp=mul_add(a6, b2, temp); - c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); - - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - temp=mul_add(a7, b2, temp); - c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); - - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - temp=mul_add(a[8], b2, temp); - c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); - - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - temp=mul_add(a[9], 
b2, temp); - c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); - - - temp = _mm256_mullo_epi16 (a0, b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - temp=mul_add(a[10], b2, temp); - c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); - - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - temp=mul_add(a[11], b2, temp); - c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); - - - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); - temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - temp=mul_add(a[12], b2, temp); - c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); - - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp 
); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - temp=mul_add(a[13], b2, temp); - c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - temp=mul_add(a1, b[1], temp); - c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); - - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - temp=mul_add(a1, b[2], temp); - c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); - - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); 
- temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - temp=mul_add(a1, b[3], temp); - c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); - - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - temp=mul_add(a1, b[4], temp); - c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); - - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - temp=mul_add(a1, b[5], temp); - c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); - - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - temp=mul_add(a1, b[6], temp); - c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); - - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - temp=mul_add(a1, b[7], temp); - c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); - - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add 
(a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - temp=mul_add(a1, b7, temp); - c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); - - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - temp=mul_add(a1, b6, temp); - c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); - - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - temp=mul_add(a1, b5, temp); - c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); - - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - temp=mul_add(a1, b4, temp); - c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); - - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - temp=mul_add(a1, b3, temp); - c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); - - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - temp=mul_add(a1, b2, temp); - c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); - - - c_avx[30] = mul_add(a1, b1, c_avx[30]); - - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - - -} - - - -static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are not added cummulatively -{ - - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - 
b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first triangle - c_avx[0] = _mm256_mullo_epi16 (a0, b0); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[1]=mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b2); - - temp = mul_add(a1, b1, temp); - c_avx[2]= mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c_avx[3]= mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c_avx[4]= mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c_avx[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c_avx[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - c_avx[7] = mul_add (a5, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - c_avx[8] = mul_add (a6, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - c_avx[9] = mul_add (a7, b2, 
temp); - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - c_avx[10] = mul_add (a[8], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - c_avx[11] = mul_add (a[9], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - c_avx[12] = mul_add (a[10], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - c_avx[13] = mul_add (a[11], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, 
b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); - temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - c_avx[14] = mul_add (a[12], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp ); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - c_avx[15] = mul_add (a[13], b2, temp ); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - c_avx[16] = mul_add (a1, b[1], temp ); - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add 
(a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - c_avx[17] = mul_add (a1, b[2], temp ); - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); - temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - c_avx[18] = mul_add (a1, b[3], temp ); - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - c_avx[19] = mul_add (a1, b[4], temp ); - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - c_avx[20] = mul_add (a1, b[5], temp ); - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - c_avx[21] = mul_add (a1, b[6], temp ); - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, 
b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - c_avx[22] = mul_add (a1, b[7], temp ); - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add (a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - c_avx[23] = mul_add (a1, b7, temp ); - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - c_avx[24] = mul_add (a1, b6, temp ); - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - c_avx[25] = mul_add (a1, b5, temp ); - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - c_avx[26] = mul_add (a1, b4, temp ); - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - c_avx[27] = mul_add (a1, b3, temp ); - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - c_avx[28] = mul_add (a1, b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[29] = mul_add (a1, b0, temp); - - c_avx[30] = _mm256_mullo_epi16 (a1, b1); - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - -} diff --git a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c deleted file mode 100644 index 78fb86c2..00000000 --- a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c +++ /dev/null @@ -1,1010 +0,0 @@ -/* -Cleaned version for step by step approach look into the _debug file -*/ -//#include "timing.c" 
-#include "consts.h" -#include "matrix.c" -#include "scm_avx.c" - -static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX. -{ - __m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time - - //uint16_t i; - - register __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - - //CLOCK1=cpucycles(); - - //------------------AVX evaluation for 1st poly----------------------- - - r0_avx=a[0]; - r1_avx=a[1]; - r2_avx=a[2]; - r3_avx=a[3]; - a_bucket[0]=r0_avx; - a_bucket[1]=r1_avx; - a_bucket[2]=r2_avx; - a_bucket[3]=r3_avx; - a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]); - - - //------------------AVX evaluation for 1st poly ends------------------ - - - //------------------AVX evaluation for 2nd poly----------------------- - r0_avx=a[small_len_avx]; - r1_avx=a[small_len_avx+1]; - r2_avx=a[small_len_avx+2]; - r3_avx=a[small_len_avx+3]; - a_bucket[0+9]=r0_avx; - a_bucket[1+9]=r1_avx; - a_bucket[2+9]=r2_avx; - a_bucket[3+9]=r3_avx; - a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]); - - - //------------------AVX evaluation for 2nd poly ends------------------ - - - //------------------AVX evaluation for 3rd poly----------------------- - r0_avx=a[2*small_len_avx]; - r1_avx=a[2*small_len_avx+1]; - r2_avx=a[2*small_len_avx+2]; - r3_avx=a[2*small_len_avx+3]; - a_bucket[0+18]=r0_avx; - a_bucket[1+18]=r1_avx; - a_bucket[2+18]=r2_avx; - a_bucket[3+18]=r3_avx; - a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+18]= 
_mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]); - - //------------------AVX evaluation for 3rd poly ends------------------ - - - //------------------AVX evaluation for 4th poly----------------------- - - r0_avx=a[3*small_len_avx]; - r1_avx=a[3*small_len_avx+1]; - r2_avx=a[3*small_len_avx+2]; - r3_avx=a[3*small_len_avx+3]; - a_bucket[0+27]=r0_avx; - a_bucket[1+27]=r1_avx; - a_bucket[2+27]=r2_avx; - a_bucket[3+27]=r3_avx; - a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]); - - //------------------AVX evaluation for 4th poly ends------------------ - - //------------------AVX evaluation for 5th poly----------------------- - - r0_avx=a[4*small_len_avx+0]; - r1_avx=a[4*small_len_avx+1]; - r2_avx=a[4*small_len_avx+2]; - r3_avx=a[4*small_len_avx+3]; - a_bucket[0+36]=r0_avx; - a_bucket[1+36]=r1_avx; - a_bucket[2+36]=r2_avx; - a_bucket[3+36]=r3_avx; - a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]); - - //------------------AVX evaluation for 5th poly ends------------------ - - - //------------------AVX evaluation for 6th poly----------------------- - r0_avx=a[5*small_len_avx]; - r1_avx=a[5*small_len_avx+1]; - r2_avx=a[5*small_len_avx+2]; - r3_avx=a[5*small_len_avx+3]; - a_bucket[0+45]=r0_avx; - a_bucket[1+45]=r1_avx; - a_bucket[2+45]=r2_avx; - a_bucket[3+45]=r3_avx; - a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - 
a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]); - - //------------------AVX evaluation for 6th poly ends------------------ - - //------------------AVX evaluation for 7th poly----------------------- - - r0_avx=a[6*small_len_avx]; - r1_avx=a[6*small_len_avx+1]; - r2_avx=a[6*small_len_avx+2]; - r3_avx=a[6*small_len_avx+3]; - a_bucket[0+54]=r0_avx; - a_bucket[1+54]=r1_avx; - a_bucket[2+54]=r2_avx; - a_bucket[3+54]=r3_avx; - a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]); - - //------------------AVX evaluation for 7th poly ends------------------ - - - - //CLOCK2=cpucycles(); - //CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1); - //printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1); - - - //CLOCK1=cpucycles(); - //-----------------Forward transposes-------------------------------------- - transpose_n1(a_bucket); - transpose_n1(a_bucket+16); - transpose_n1(a_bucket+32); - transpose_n1(a_bucket+48); - - //-----------------Forwatrd transposes ends--------------------------------- - - //----------------------all multiplications--------------------------------- - if(f==0){ - schoolbook_avx_new2(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - else{ - schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - //schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - 
schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - /* - schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f); - schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f); - */ - - - //----------------------all multiplications ends----------------------------- - - - //-----------------Reverse transposes-------------------------------------- - - /* - transpose(c_bucket); - transpose(c_bucket+16); - - transpose(c_bucket+2*SCM_SIZE); - transpose(c_bucket+16+2*SCM_SIZE); - - transpose(c_bucket+4*SCM_SIZE); - transpose(c_bucket+16+4*SCM_SIZE); - - transpose(c_bucket+6*SCM_SIZE); - transpose(c_bucket+16+6*SCM_SIZE); - */ - //-----------------Reverse transposes ends--------------------------------- - - //CLOCK2=cpucycles(); - //CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1); - - //KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6); - -} - -static void KARA_eval(__m256i* b, __m256i *b_bucket){ - - __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - //-------1st poly---------------------------------------------------- - r0_avx=b[0]; - r1_avx=b[1]; - r2_avx=b[2]; - r3_avx=b[3]; - b_bucket[0]=r0_avx; - b_bucket[1]=r1_avx; - b_bucket[2]=r2_avx; - b_bucket[3]=r3_avx; - b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]); - //-------2nd poly---------------------------------------------------- - - r0_avx=b[small_len_avx]; - r1_avx=b[small_len_avx+1]; - r2_avx=b[small_len_avx+2]; - r3_avx=b[small_len_avx+3]; - b_bucket[0+9]=r0_avx; - b_bucket[1+9]=r1_avx; - b_bucket[2+9]=r2_avx; - b_bucket[3+9]=r3_avx; - b_bucket[4+9]= 
_mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]); - - //-------3rd poly---------------------------------------------------- - - r0_avx=b[2*small_len_avx+0]; - r1_avx=b[2*small_len_avx+1]; - r2_avx=b[2*small_len_avx+2]; - r3_avx=b[2*small_len_avx+3]; - b_bucket[0+18]=r0_avx; - b_bucket[1+18]=r1_avx; - b_bucket[2+18]=r2_avx; - b_bucket[3+18]=r3_avx; - b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]); - - //-------4th poly---------------------------------------------------- - r0_avx=b[3*small_len_avx]; - r1_avx=b[3*small_len_avx+1]; - r2_avx=b[3*small_len_avx+2]; - r3_avx=b[3*small_len_avx+3]; - b_bucket[0+27]=r0_avx; - b_bucket[1+27]=r1_avx; - b_bucket[2+27]=r2_avx; - b_bucket[3+27]=r3_avx; - b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]); - - //-------5th poly---------------------------------------------------- - - r0_avx=b[4*small_len_avx]; - r1_avx=b[4*small_len_avx+1]; - r2_avx=b[4*small_len_avx+2]; - r3_avx=b[4*small_len_avx+3]; - b_bucket[0+36]=r0_avx; - b_bucket[1+36]=r1_avx; - b_bucket[2+36]=r2_avx; - b_bucket[3+36]=r3_avx; - b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]); - - //-------6th 
poly---------------------------------------------------- - - r0_avx=b[5*small_len_avx]; - r1_avx=b[5*small_len_avx+1]; - r2_avx=b[5*small_len_avx+2]; - r3_avx=b[5*small_len_avx+3]; - b_bucket[0+45]=r0_avx; - b_bucket[1+45]=r1_avx; - b_bucket[2+45]=r2_avx; - b_bucket[3+45]=r3_avx; - b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]); - - //-------7th poly---------------------------------------------------- - - r0_avx=b[6*small_len_avx]; - r1_avx=b[6*small_len_avx+1]; - r2_avx=b[6*small_len_avx+2]; - r3_avx=b[6*small_len_avx+3]; - b_bucket[0+54]=r0_avx; - b_bucket[1+54]=r1_avx; - b_bucket[2+54]=r2_avx; - b_bucket[3+54]=r3_avx; - b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]); - - //--------------Evaluating B poly ends------------------------------- - - transpose_n1(b_bucket); - transpose_n1(b_bucket+16); - transpose_n1(b_bucket+32); - transpose_n1(b_bucket+48); -} - -static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){ - - //int64_t i; - register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results - - __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; - - //CLOCK1=cpucycles(); - - //------------------------AVX interpolation for 1st poly external------------------- - - //loop1 - res_avx0 = c_bucket[0]; - res_avx2 = c_bucket[1]; - res_avx4 = c_bucket[2]; - res_avx6 = 
c_bucket[3]; - - c6_avx=c_bucket[6]; - c7_avx=c_bucket[7]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx); - - res_avx1 = c_bucket[16]; - res_avx3 = c_bucket[17]; - res_avx5 = c_bucket[18]; - res_avx7 = c_bucket[19]; - - c22_avx=c_bucket[22]; - c23_avx=c_bucket[23]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final0[0]=res_avx0; - result_final0[1]=res_avx1; - - result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final0[6]=res_avx6; - result_final0[7]=res_avx7; - - - //------------------------AVX interpolation for 1st poly ends-------------- - - - //------------------------AVX interpolation for 2nd poly external------------------- - - //loop1 - res_avx0 = c_bucket[9]; //c_bucket0 - res_avx2 = c_bucket[10]; //c_bucket1 - res_avx4 = c_bucket[11]; //c_bucket2 - res_avx6 = c_bucket[12]; 
//c_bucket3 - - c6_avx=c_bucket[15]; //c_bucket6 - c7_avx=c_bucket[32]; //c_bucket7 - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); - - res_avx1 = c_bucket[25]; //c_bucket0 - res_avx3 = c_bucket[26]; //c_bucket1 - res_avx5 = c_bucket[27]; //c_bucket2 - res_avx7 = c_bucket[28]; //c_bucket3 - - c22_avx=c_bucket[31]; - c23_avx=c_bucket[48]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final1[0]=res_avx0; - result_final1[1]=res_avx1; - - result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final1[6]=res_avx6; - result_final1[7]=res_avx7; - - - //------------------------AVX interpolation for 2nd poly ends-------------- - - //------------------------AVX interpolation for 3rd poly external------------------- - - //loop1 - res_avx0 = c_bucket[34]; //c_bucket0 - res_avx2 = c_bucket[35]; 
//c_bucket1 - res_avx4 = c_bucket[36]; - res_avx6 = c_bucket[37]; - - c6_avx=c_bucket[40]; - c7_avx=c_bucket[41]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); - - res_avx1 = c_bucket[50]; //c_bucket0 - res_avx3 = c_bucket[51]; //c_bucket1 - res_avx5 = c_bucket[52]; - res_avx7 = c_bucket[53]; - - c22_avx=c_bucket[56]; - c23_avx=c_bucket[57]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - //loop4 - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - //loop5 - result_final2[0]=res_avx0; - result_final2[1]=res_avx1; - - result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final2[6]=res_avx6; - result_final2[7]=res_avx7; - - //------------------------AVX interpolation for 3rd poly ends-------------- - - //------------------------AVX interpolation for 4th poly external------------------- - - //loop1 - res_avx0 = c_bucket[43]; - res_avx2 = c_bucket[44]; - res_avx4 = 
c_bucket[45]; - res_avx6 = c_bucket[46]; - - c6_avx=c_bucket[65]; - c7_avx=c_bucket[66]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); - - res_avx1 = c_bucket[59]; - res_avx3 = c_bucket[60]; - res_avx5 = c_bucket[61]; - res_avx7 = c_bucket[62]; - - c22_avx=c_bucket[81]; - c23_avx=c_bucket[82]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final3[0]=res_avx0; - result_final3[1]=res_avx1; - - result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final3[6]=res_avx6; - result_final3[7]=res_avx7; - - - //------------------------AVX interpolation for 4th poly ends-------------- - - //------------------------AVX interpolation for 5th poly external------------------- - - //loop1 - res_avx0 = c_bucket[68]; - res_avx2 = c_bucket[69]; - res_avx4 = c_bucket[70]; - res_avx6 = c_bucket[71]; - - 
c6_avx=c_bucket[74]; - c7_avx=c_bucket[75]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); - - res_avx1 = c_bucket[84]; - res_avx3 = c_bucket[85]; - res_avx5 = c_bucket[86]; - res_avx7 = c_bucket[87]; - - c22_avx=c_bucket[90]; - c23_avx=c_bucket[91]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final4[0]=res_avx0; - result_final4[1]=res_avx1; - - result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final4[6]=res_avx6; - result_final4[7]=res_avx7; - - - //------------------------AVX interpolation for 5th poly ends-------------- - - //------------------------AVX interpolation for 6th poly external------------------- - - //loop1 - res_avx0 = c_bucket[77]; - res_avx2 = c_bucket[78]; - res_avx4 = c_bucket[79]; - res_avx6 = c_bucket[96]; - - c6_avx=c_bucket[99]; - c7_avx=c_bucket[100]; - - 
c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx); - - res_avx1 = c_bucket[93]; - res_avx3 = c_bucket[94]; - res_avx5 = c_bucket[95]; - res_avx7 = c_bucket[112]; - - c22_avx=c_bucket[115]; - c23_avx=c_bucket[116]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final5[0]=res_avx0; - result_final5[1]=res_avx1; - - result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final5[6]=res_avx6; - result_final5[7]=res_avx7; - - - //------------------------AVX interpolation for 6th poly ends-------------- - - //------------------------AVX interpolation for 7th poly external------------------- - - //loop1 - res_avx0 = c_bucket[102]; - res_avx2 = c_bucket[103]; - res_avx4 = c_bucket[104]; - res_avx6 = c_bucket[105]; - - c6_avx=c_bucket[108]; - c7_avx=c_bucket[109]; - - c8_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); - - res_avx1 = c_bucket[118]; - res_avx3 = c_bucket[119]; - res_avx5 = c_bucket[120]; - res_avx7 = c_bucket[121]; - - c22_avx=c_bucket[124]; - c23_avx=c_bucket[125]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final6[0]=res_avx0; - result_final6[1]=res_avx1; - - result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final6[6]=res_avx6; - result_final6[7]=res_avx7; - - - //------------------------AVX interpolation for 7th poly ends-------------- - - //CLOCK2=cpucycles(); - //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); - //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); - - - -} - -static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ - - int i; - -//---------------AVX data----------------------------- 
- - __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; - __m256i aw_avx[7*small_len_avx]; - -//----------------AVX data---------------------------- - - -// EVALUATION - - //CLOCK1=cpucycles(); - - for (i=0; i> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); - PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); - memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); + PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, res); // pack public key } -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { - uint16_t A[SABER_L][SABER_L][SABER_N]; - uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {{0}}; - uint16_t vp[SABER_N] = {0}; - uint16_t mp[SABER_N]; - uint16_t b[SABER_L][SABER_N]; + +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { size_t i, j; + + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + poly s[SABER_L]; + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, noiseseed); PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); - PQCLEAN_LIGHTSABER_CLEAN_GenSecret(sp, seed_sp); - PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); + PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed - for (i = 0; i < 
SABER_L; i++) { + + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits for (j = 0; j < SABER_N; j++) { - bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } + PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, res); - PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, pk); - PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - - PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(mp, m); - - for (j = 0; j < SABER_N; j++) { - vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); - } - - PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); -} - -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { - - uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N]; - uint16_t v[SABER_N] = {0}; - uint16_t cm[SABER_N]; - size_t i; - - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk); - PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext); - PQCLEAN_LIGHTSABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); - PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); + // vector-vector scalar multiplication with mod p + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, pk); + PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vprime, temp, s); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(message, m); for (i = 0; i < SABER_N; i++) { - v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; + } + + PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(msk_c, vprime); +} + + +void 
PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; + + poly temp[SABER_L]; + poly s[SABER_L]; + + const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; + + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, ciphertext); + PQCLEAN_LIGHTSABER_CLEAN_InnerProd(&temp[0], temp, s); + + PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, packed_cm); + + for (i = 0; i < SABER_N; i++) { + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(m, v); diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.h b/crypto_kem/lightsaber/clean/SABER_indcpa.h index efccbf5e..df8906ab 100644 --- a/crypto_kem/lightsaber/clean/SABER_indcpa.h +++ b/crypto_kem/lightsaber/clean/SABER_indcpa.h @@ -5,7 +5,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); +void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); diff --git a/crypto_kem/lightsaber/clean/SABER_params.h b/crypto_kem/lightsaber/clean/SABER_params.h index a6a9fc55..8da6ec34 100644 --- a/crypto_kem/lightsaber/clean/SABER_params.h +++ b/crypto_kem/lightsaber/clean/SABER_params.h @@ -2,19 +2,21 @@ #define 
PARAMS_H -/* Change this for different security strengths */ - /* Don't change anything below this line */ #define SABER_L 2 #define SABER_MU 10 #define SABER_ET 3 -#define SABER_EQ 13 -#define SABER_EP 10 #define SABER_N 256 +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) + +#define SABER_EQ 13 +#define SABER_Q (1 << SABER_EQ) + #define SABER_SEEDBYTES 32 -#define SABER_NOISE_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 #define SABER_KEYBYTES 32 #define SABER_HASHBYTES 32 diff --git a/crypto_kem/lightsaber/clean/api.h b/crypto_kem/lightsaber/clean/api.h index f0fe63f1..2e39ae02 100644 --- a/crypto_kem/lightsaber/clean/api.h +++ b/crypto_kem/lightsaber/clean/api.h @@ -15,4 +15,4 @@ int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); -#endif /* api_h */ +#endif /* PQCLEAN_LIGHTSABER_CLEAN_API_H */ diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c index 2a39a1d7..f64c4143 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.c +++ b/crypto_kem/lightsaber/clean/pack_unpack.c @@ -1,140 +1,153 @@ -#include "api.h" +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ((data[offset_data + 1] & 0x7) << 3) | ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2) & 0x01) | ((data[offset_data + 3] & 0x7) << 1) | ((data[offset_data + 4] & 0x7) << 4) | (((data[offset_data 
+ 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1) & 0x03) | ((data[offset_data + 6] & 0x7) << 2) | ((data[offset_data + 7] & 0x7) << 5); + out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); + out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); + out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); + in += 8; + out += 3; } } -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3) & 0x07; - data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6) & 0x03) | (((bytes[offset_byte + 1]) & 0x01) << 2); - data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1) & 0x07; - data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4) & 0x07; - data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7) & 0x01) | (((bytes[offset_byte + 2]) & 0x03) << 1); - data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07); - data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07); + out[0] = (in[0]) & 0x07; + out[1] = ((in[0]) >> 3) & 0x07; + out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); + out[3] = ((in[1]) >> 1) & 0x07; + out[4] = ((in[1]) >> 4) & 0x07; + out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); + out[6] = ((in[2] >> 2) & 0x07); + out[7] = ((in[2] >> 5) & 0x07); + in += 3; + out += 8; } } -static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) 
{ + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); - bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } -static void BS2POLq(uint16_t data[SABER_N], const uint8_t 
bytes[SABER_POLYBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; } } -static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { 
- size_t j, offset_byte, offset_data; +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; } } -static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); - data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); - data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | 
((in[4] & 0xff) << 2); + in += 5; + out += 4; } } -void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); } } -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); } } -void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); } } -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); } } -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { size_t i, j; for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } } -void 
PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { +void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { size_t i, j; memset(bytes, 0, SABER_KEYBYTES); for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } } diff --git a/crypto_kem/lightsaber/clean/pack_unpack.h b/crypto_kem/lightsaber/clean/pack_unpack.h index 44ccf31a..0eda3392 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.h +++ b/crypto_kem/lightsaber/clean/pack_unpack.h @@ -1,27 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); 
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); +void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); -void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); +void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c index 9bb55afe..e5be857f 100644 --- a/crypto_kem/lightsaber/clean/poly.c +++ b/crypto_kem/lightsaber/clean/poly.c @@ -3,32 +3,40 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include "poly_mul.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { +void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { size_t i, j; - for (i = 0; i < SABER_L; i++) { - for (j = 0; j < SABER_L; j++) { - if (transpose == 1) { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); - } else { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); + } + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); } } } } -void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - size_t j; - for (j = 0; j < 
SABER_L; j++) { - PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res, b[j], s[j]); +void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { + size_t i; + + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); + for (i = 1; i < SABER_L; i++) { + PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); } } -void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYVECBYTES]; +void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); @@ -37,13 +45,13 @@ void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], c } } -void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; +void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; - shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); for (i = 0; i < SABER_L; i++) { - PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); + PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); } } diff --git a/crypto_kem/lightsaber/clean/poly.h b/crypto_kem/lightsaber/clean/poly.h index 1f50c48e..be074e43 100644 --- a/crypto_kem/lightsaber/clean/poly.h +++ b/crypto_kem/lightsaber/clean/poly.h @@ -3,13 +3,21 @@ #include "SABER_params.h" #include -void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); +typedef union { + uint16_t coeffs[SABER_N]; +} poly; -void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const 
uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); -void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); +void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); -void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); +void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); + +void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); + + +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate); #endif diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c index c7f5c424..d82d8585 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.c +++ b/crypto_kem/lightsaber/clean/poly_mul.c @@ -1,4 +1,4 @@ -#include "poly_mul.h" +#include "poly.h" #include #include @@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t } /* res += a*b */ -void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { - uint16_t c[2 * SABER_N] = {0}; +void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { + uint16_t C[2 * SABER_N] = {0}; size_t i; - toom_cook_4way(c, a, b); + toom_cook_4way(C, a->coeffs, b->coeffs); /* reduction */ - for (i = SABER_N; i < 2 * SABER_N; i++) { - res[i - SABER_N] += (c[i - SABER_N] - c[i]); + if (accumulate == 0) { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); + } + } else { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); + } } } diff --git 
a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h index 5ec233bb..b28b04f6 100644 --- a/crypto_kem/lightsaber/clean/poly_mul.h +++ b/crypto_kem/lightsaber/clean/poly_mul.h @@ -1,9 +1,3 @@ -#ifndef POLY_MUL_H -#define POLY_MUL_H -#include "SABER_params.h" -#include - -void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); -#endif + diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 87187702..7eb15ca2 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile index 65cc21ef..070665b4 100644 --- a/crypto_kem/saber/avx2/Makefile +++ b/crypto_kem/saber/avx2/Makefile @@ -2,7 +2,7 @@ LIB=libsaber_avx2.a HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h -OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o +OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common 
$(EXTRAFLAGS) diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c index 5515c610..e01eb650 100644 --- a/crypto_kem/saber/avx2/SABER_indcpa.c +++ b/crypto_kem/saber/avx2/SABER_indcpa.c @@ -1,416 +1,125 @@ -#include "./polymul/toom-cook_4way.c" #include "SABER_indcpa.h" #include "SABER_params.h" -#include "api.h" -#include "cbd.h" #include "fips202.h" #include "pack_unpack.h" +#include "poly.h" #include "randombytes.h" #include -#include #include -//#include "randombytes.h" -//#include "./polymul/toom_cook_4/toom-cook_4way.c" -#define h1 4 //2^(EQ-EP-1) +#define h1 (1 << (SABER_EQ - SABER_EP - 1)) +#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) -#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) +void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + poly *skpv1 = A[0]; // use first row of A to hold sk temporarily + toom4_points skpv1_eval[SABER_L]; + poly res[SABER_L]; -static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { - int32_t i, j; + uint8_t rand[SABER_NOISESEEDBYTES]; + uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; - for (j = 0; j < SABER_KEYBYTES; j++) { - message_dec[j] = 0; - for (i = 0; i < 8; i++) { - message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); - } - } -} + randombytes(seed_A, SABER_SEEDBYTES); + shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state -/*----------------------------------------------------------------------------------- - This routine generates a=[Matrix K x K] of 256-coefficient polynomials --------------------------------------------------------------------------------------*/ + randombytes(rand, SABER_NOISESEEDBYTES); + PQCLEAN_SABER_AVX2_GenSecret(skpv1, rand); + 
PQCLEAN_SABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key -static void GenMatrix(polyvec *a, const uint8_t *seed) { - uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; - - uint16_t temp_ar[SABER_N]; - - int i, j, k; - uint16_t mod = (SABER_Q - 1); - - shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - PQCLEAN_SABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); - for (k = 0; k < SABER_N; k++) { - a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; - } - } - } -} - -static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { - - uint32_t i; - - uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; - - shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); - - for (i = 0; i < SABER_K; i++) { - PQCLEAN_SABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); - } -} - -//********************************matrix-vector mul routines***************************************************** -static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { - int64_t i, j; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - for (j = 0; j < NUM_POLY; j++) { - - if (isTranspose == 0) { - toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); - } else { - toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); - } - } - - TC_interpol(c_bucket, res_avx[i]); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); } -} + PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A); // sample matrix A + PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order -static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { - 
- int64_t i; - - __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time - - for (i = 0; i < NUM_POLY; i++) { - toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); - } - TC_interpol(c_bucket, res_avx); -} - -//********************************matrix-vector mul routines***************************************************** - -void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { - - polyvec a[SABER_K]; - - uint16_t skpv1[SABER_K][SABER_N]; - - - - uint8_t seed[SABER_SEEDBYTES]; - uint8_t noiseseed[SABER_COINBYTES]; - int32_t i, j, k; - - -//--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod; - __m256i res_avx[SABER_K][SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - -//--------------AVX declaration ends------------------ - - randombytes(seed, SABER_SEEDBYTES); - - shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state - randombytes(noiseseed, SABER_COINBYTES); - - - GenMatrix(a, seed); //sample matrix A - - GenSecret(skpv1, noiseseed); - - -// Load sk into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); - } - - } - - // Load a into avx vectors - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } + // rounding + for (i = 0; i < SABER_L; i++) { + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - - //------------------------do the matrix vector multiplication and rounding------------ - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); 
- } - matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order - - // Now truncation - - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - } - } - - //------------------Pack sk into byte string------- - - PQCLEAN_SABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); - - //------------------Pack pk into byte string------- - - for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - PQCLEAN_SABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string - - - for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. 
- pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; - } - + PQCLEAN_SABER_AVX2_POLVECp2BS(pk, res); // pack public key } void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { + size_t i, j; + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + toom4_points skpv1_eval[SABER_L]; - uint32_t i, j, k; - polyvec a[SABER_K]; // skpv; - uint8_t seed[SABER_SEEDBYTES]; - uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; - uint16_t skpv1[SABER_K][SABER_N]; - uint16_t temp[SABER_K][SABER_N]; - uint16_t message[SABER_KEYBYTES * 8]; - - uint8_t msk_c[SABER_SCALEBYTES_KEM]; - - //--------------AVX declaration------------------ - - __m256i sk_avx[SABER_K][SABER_N / 16]; - __m256i mod, mod_p; - __m256i res_avx[SABER_K][SABER_N / 16]; - __m256i vprime_avx[SABER_N / 16]; - __m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; - //__m256i acc[2*SABER_N/16]; - - __m256i pkcl_avx[SABER_K][SABER_N / 16]; - - __m256i message_avx[SABER_N / 16]; - - mod = _mm256_set1_epi16(SABER_Q - 1); - mod_p = _mm256_set1_epi16(SABER_P - 1); - - - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - - //--------------AVX declaration ends------------------ - for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. 
- seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; + PQCLEAN_SABER_AVX2_GenSecret(temp, noiseseed); + for (j = 0; j < SABER_L; j++) { + PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); } - GenMatrix(a, seed); - GenSecret(skpv1, noiseseed); + PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A); + PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits + for (j = 0; j < SABER_N; j++) { + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - - // ----------- Load skpv1 into avx vectors ---------- - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_K; j++) { - for (k = 0; k < SABER_N / 16; k++) { - a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); - } - } - } - //-----------------matrix-vector multiplication and rounding - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sk_avx[j], b_bucket[j]); - } - matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order - - // Now truncation - - for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits - for (j = 0; j < SABER_N / 16; j++) { - res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); - res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); - res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); - - } - } - - - //-----this result should be put in b_prime for later use in server. 
- for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); - } - } - - PQCLEAN_SABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string - -//**************client matrix-vector multiplication ends******************// - - //------now calculate the v' - - //-------unpack the public_key - PQCLEAN_SABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); - } - } - - // InnerProduct - //for(k=0;k> i) & 0x01); - } - } - // message encoding - for (i = 0; i < SABER_N / 16; i++) { - message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); - message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); - } - - // SHIFTRIGHT(v'+h1-m mod p, EP-ET) - for (k = 0; k < SABER_N / 16; k++) { - vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); - vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); - vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); - } - - // Unpack avx - for (j = 0; j < SABER_N / 16; j++) { - _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); - } - - PQCLEAN_SABER_AVX2_SABER_pack_4bit(msk_c, temp[0]); - - - for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { - ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; + for (i = 0; i < SABER_N; i++) { + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; } + PQCLEAN_SABER_AVX2_POLT2BS(msk_c, vprime); } void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; - uint32_t i, j; - 
uint16_t sksv[SABER_K][SABER_N]; //secret key of the server - uint16_t pksv[SABER_K][SABER_N]; - uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; - uint8_t scale_ar[SABER_SCALEBYTES_KEM]; - uint16_t op[SABER_N]; + poly temp[SABER_L]; + toom4_points sksv_eval[SABER_L]; - //--------------AVX declaration------------------ + const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; - - //__m256i mod_p; - - __m256i v_avx[SABER_N / 16]; - - //__m256i acc[2*SABER_N/16]; - - __m256i sksv_avx[SABER_K][SABER_N / 16]; - __m256i pksv_avx[SABER_K][SABER_N / 16]; - - //mod_p=_mm256_set1_epi16(SABER_P-1); - - __m256i b_bucket[NUM_POLY][SCHB_N * 4]; - //--------------AVX declaration ends------------------ - - //-------unpack the public_key - - PQCLEAN_SABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key - PQCLEAN_SABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext - - for (i = 0; i < SABER_K; i++) { - for (j = 0; j < SABER_N / 16; j++) { - sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); - pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); - } + PQCLEAN_SABER_AVX2_BS2POLVECq(temp, sk); + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); } - for (i = 0; i < SABER_N / 16; i++) { - v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); - } + PQCLEAN_SABER_AVX2_BS2POLVECp(temp, ciphertext); + PQCLEAN_SABER_AVX2_InnerProd(v, temp, sksv_eval); + PQCLEAN_SABER_AVX2_BS2POLT(cm, packed_cm); - // InnerProduct(b', s, mod p) - - for (j = 0; j < NUM_POLY; j++) { - TC_eval(sksv_avx[j], b_bucket[j]); - } - - vector_vector_mul(v_avx, pksv_avx, b_bucket); - - for (i = 0; i < SABER_N / 16; i++) { - _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); - } - - - for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { - scale_ar[i] = 
ciphertext[SABER_CIPHERTEXTBYTES + i]; - } - - PQCLEAN_SABER_AVX2_SABER_un_pack4bit(op, scale_ar); - - - //addition of h2 for (i = 0; i < SABER_N; i++) { - message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } - - POL2MSG(m, message_dec_unpacked); + PQCLEAN_SABER_AVX2_POLmsg2BS(m, v); } diff --git a/crypto_kem/saber/avx2/SABER_params.h b/crypto_kem/saber/avx2/SABER_params.h index 9b0edafe..d1a5ddd7 100644 --- a/crypto_kem/saber/avx2/SABER_params.h +++ b/crypto_kem/saber/avx2/SABER_params.h @@ -1,46 +1,41 @@ #ifndef PARAMS_H #define PARAMS_H -#include "api.h" - - -#define SABER_K 3 +/* Don't change anything below this line */ +#define SABER_L 3 #define SABER_MU 8 #define SABER_ET 4 +#define SABER_N 256 + +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) #define SABER_EQ 13 -#define SABER_EP 10 +#define SABER_Q (1 << SABER_EQ) -#define SABER_N 256 -#define SABER_Q 8192 //2^13 -#define SABER_P 1024 +#define SABER_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 +#define SABER_KEYBYTES 32 +#define SABER_HASHBYTES 32 -#define SABER_SEEDBYTES 32 -#define SABER_NOISESEEDBYTES 32 -#define SABER_COINBYTES 32 -#define SABER_KEYBYTES 32 +#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) -#define SABER_HASHBYTES 32 +#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) +#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) -#define SABER_POLYBYTES 416 //13*256/8 +#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) +#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) -#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) - -#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation - -#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) - -#define SABER_SCALEBYTES_KEM 
((SABER_ET)*SABER_N/8) +#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) +#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) -#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) - -#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ +#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) #endif diff --git a/crypto_kem/saber/avx2/cbd.c b/crypto_kem/saber/avx2/cbd.c index 7639d7d2..53335375 100644 --- a/crypto_kem/saber/avx2/cbd.c +++ b/crypto_kem/saber/avx2/cbd.c @@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -static uint64_t load_littleendian(const unsigned char *x, int bytes) { +static uint64_t load_littleendian(const uint8_t *x, int bytes) { int i; uint64_t r = x[0]; for (i = 1; i < bytes; i++) { @@ -20,32 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { return r; } - -void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { - uint16_t Qmod_minus1 = SABER_Q - 1; - +void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { uint32_t t, d, a[4], b[4]; int i, j; for (i = 0; i < SABER_N / 4; i++) { - t = load_littleendian(buf + 4 * i, 4); + t = (uint32_t) load_littleendian(buf + 4 * i, 4); d = 0; for (j = 0; j < 4; j++) { d += (t >> j) & 0x11111111; } - a[0] = d & 0xf; - b[0] = (d >> 4) & 0xf; - a[1] = (d >> 8) & 0xf; + a[0] = d & 0xf; + b[0] = (d >> 4) & 0xf; + a[1] = (d >> 8) & 0xf; b[1] = (d >> 12) & 0xf; a[2] = (d >> 16) & 0xf; b[2] = (d >> 
20) & 0xf; a[3] = (d >> 24) & 0xf; b[3] = (d >> 28); - r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; - r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; - r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; - r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; + s[4 * i + 0] = (uint16_t)(a[0] - b[0]); + s[4 * i + 1] = (uint16_t)(a[1] - b[1]); + s[4 * i + 2] = (uint16_t)(a[2] - b[2]); + s[4 * i + 3] = (uint16_t)(a[3] - b[3]); } } diff --git a/crypto_kem/saber/avx2/cbd.h b/crypto_kem/saber/avx2/cbd.h index e80ffc75..afe84bf3 100644 --- a/crypto_kem/saber/avx2/cbd.h +++ b/crypto_kem/saber/avx2/cbd.h @@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle ----------------------------------------------------------------------*/ -#include "poly.h" +#include "SABER_params.h" #include -void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); +void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); #endif diff --git a/crypto_kem/saber/avx2/kem.c b/crypto_kem/saber/avx2/kem.c index c88bb315..e47e985f 100644 --- a/crypto_kem/saber/avx2/kem.c +++ b/crypto_kem/saber/avx2/kem.c @@ -4,14 +4,12 @@ #include "fips202.h" #include "randombytes.h" #include "verify.h" -#include +#include #include -#include -#include int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { - int i; + size_t i; PQCLEAN_SABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { @@ -39,7 +37,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); // K^ <-- kr[0:31] // noiseseed (r) <-- kr[32:63]; - PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains 
randomness r; + PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); @@ -49,7 +47,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) } int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { - int i; + size_t i; uint8_t fail; uint8_t cmp[SABER_BYTES_CCA_DEC]; uint8_t buf[64]; @@ -65,7 +63,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_ sha3_512(kr, buf, 64); - PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); + PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); fail = PQCLEAN_SABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h index 612ff4ff..b28b04f6 100644 --- a/crypto_kem/saber/avx2/kem.h +++ b/crypto_kem/saber/avx2/kem.h @@ -1,35 +1,3 @@ -#ifndef INDCPA_H -#define INDCPA_H - -#include - -void PQCLEAN_SABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); - - -void PQCLEAN_SABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_SABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); - - -void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); - -void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); - -void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); - - -int PQCLEAN_SABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); - -int PQCLEAN_SABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); - -int PQCLEAN_SABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); -//uint64_t clock1,clock2; - -//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; - - -#endif diff --git 
a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c index 00bf9c08..9bb46acb 100644 --- a/crypto_kem/saber/avx2/pack_unpack.c +++ b/crypto_kem/saber/avx2/pack_unpack.c @@ -1,502 +1,145 @@ +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" +#include - -void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); - bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); - } -} - -void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 3 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; - data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; - data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); - data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; - data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; - data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); - data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); - data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); - } - -} - -void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t 
offset_data = 0; - +void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); + out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); + in += 2; + out += 1; } } -void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0; - +void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 2; j++) { - offset_data = 2 * j; - data[offset_data] = bytes[j] & 0x0f; - data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; + out[0] = in[0] & 0x0f; + out[1] = (in[0] >> 4) & 0x0f; + in += 1; + out += 2; } } -void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); - bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); - } -} - - -void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 3 * j; - offset_data = 4 * j; - data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; - data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; - data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; - data[offset_data + 
3] = ((bytes[offset_byte + 2] & 0xff) >> 2); - } - -} - -void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); - } - } -} - -void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); - } - } -} - -void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; 
i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 
& (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } - - -void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - 
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } -} - -void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - - - -void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 
0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 10) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 5 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); - data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); - data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); - data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); - - } - } - - -} - - -void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ 
offset_data + 6 ] & 0x03) << 6 ); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); - - bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); - - bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); - - } - } - - -} - -void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 13) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 13 * j; - offset_data = 8 * j; - data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); - } - } - - -} - -void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { - - uint32_t j; - uint32_t offset_data = 0, offset_byte = 0; - - //for(i=0;icoeffs; for (j = 0; j < SABER_N / 8; j++) { - 
//offset_byte=offset_byte1+13*j; - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; } - //} - - } +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 
2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; + } +} -void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - /*This function packs 11 bit data stream into 8 bits of data. - */ - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; + for (j = 0; j < SABER_N / 4; j++) { + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + in += 5; + out += 4; + } +} - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); +void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); + } +} - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); +void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); + } +} - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); +void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); + } +} - 
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); - - bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); - - bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); - - bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); - - bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); +void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { + size_t i; + for (i = 0; i < SABER_L; i++) { + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); + } +} +void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { + size_t i, j; + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } - } -void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { +void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { + size_t i, j; + memset(bytes, 0, SABER_KEYBYTES); - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 11) / 8; - for (j = 0; j < SABER_N / 8; j++) { - offset_byte = offset_byte1 + 11 * j; - offset_data = 8 * j; - - data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); - - 
data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); - - data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); - - data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); - - data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); - - data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); + for (j = 0; j < SABER_KEYBYTES; j++) { + for (i = 0; i < 8; i++) { + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } - - -} - -void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); - - bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); - - bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); - - bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); - - bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); - - bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); - - bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); - } - } - - -} - - -void 
PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { - - uint32_t i, j; - uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; - - for (i = 0; i < SABER_K; i++) { - offset_byte1 = i * (SABER_N * 14) / 8; - for (j = 0; j < SABER_N / 4; j++) { - offset_byte = offset_byte1 + 7 * j; - offset_data = 4 * j; - data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); - - data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); - - data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); - - data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); - } - } - - -} - -void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_SABER_AVX2_POLVECp2BS(bytes, data); - } else if (modulus == 8192) { - PQCLEAN_SABER_AVX2_POLVECq2BS(bytes, data); - } -} - -void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) { - - if (modulus == 1024) { - PQCLEAN_SABER_AVX2_BS2POLVECp(data, bytes); - } else if (modulus == 8192) { - PQCLEAN_SABER_AVX2_BS2POLVECq(data, bytes); - } - } diff --git a/crypto_kem/saber/avx2/pack_unpack.h b/crypto_kem/saber/avx2/pack_unpack.h index e1608d4c..0965bbcd 100644 --- a/crypto_kem/saber/avx2/pack_unpack.h +++ b/crypto_kem/saber/avx2/pack_unpack.h @@ -1,56 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], 
const uint8_t *bytes, uint16_t modulus); - -void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); +void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); +void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); - -void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); - -void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); +void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); +void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); - -void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const 
uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); - -void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); +void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/saber/avx2/poly.c b/crypto_kem/saber/avx2/poly.c new file mode 100644 index 00000000..1bc268b6 --- /dev/null +++ b/crypto_kem/saber/avx2/poly.c @@ -0,0 +1,62 @@ +#include "cbd.h" +#include "fips202.h" +#include "pack_unpack.h" +#include "poly.h" + + +void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) { + size_t i, j; + toom4_points_product c_eval; + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[0][i], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[j][i], &s_eval[j], 1); + } + PQCLEAN_SABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][0], &s_eval[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][j], &s_eval[j], 1); + } + PQCLEAN_SABER_AVX2_toom4_interp(&c[i], &c_eval); + } + } +} + +void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) { + size_t i; + toom4_points_product c_eval; //Holds results for 9 Karatsuba at a time + + PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[0], &s_eval[0], 0); + for (i = 1; i < SABER_L; i++) { + 
PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[i], &s_eval[i], 1); + } + + PQCLEAN_SABER_AVX2_toom4_interp(c, &c_eval); +} + +void PQCLEAN_SABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AVX2_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); + } +} + +void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { + size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; + + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); + + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_AVX2_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); + } +} diff --git a/crypto_kem/saber/avx2/poly.h b/crypto_kem/saber/avx2/poly.h index 2978d0d8..188e31e7 100644 --- a/crypto_kem/saber/avx2/poly.h +++ b/crypto_kem/saber/avx2/poly.h @@ -1,27 +1,38 @@ #ifndef POLY_H #define POLY_H -/*--------------------------------------------------------------------- -This file has been adapted from the implementation -(available at, Public Domain https://github.com/pq-crystals/kyber) -of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" -by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, -Vadim Lyubashevsky, John M. 
Schanck, Peter Schwabe & Damien stehle -----------------------------------------------------------------------*/ #include "SABER_params.h" +#include #include -typedef struct { +typedef union { uint16_t coeffs[SABER_N]; + __m256i dummy; } poly; -typedef struct { - poly vec[SABER_K]; -} polyvec; +typedef union { + uint16_t coeffs[4 * SABER_N]; + __m256i dummy; +} toom4_points; -void PQCLEAN_SABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); +typedef union { + uint16_t coeffs[8 * SABER_N]; + __m256i dummy; +} toom4_points_product; + +void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); + +void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); + +void PQCLEAN_SABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); -void PQCLEAN_SABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); +void PQCLEAN_SABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); + +void PQCLEAN_SABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); + +void PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); #endif diff --git a/crypto_kem/saber/avx2/poly_mul.c b/crypto_kem/saber/avx2/poly_mul.c new file mode 100644 index 00000000..5ec0aa73 --- /dev/null +++ b/crypto_kem/saber/avx2/poly_mul.c @@ -0,0 +1,1524 @@ +#include "SABER_params.h" +#include "poly.h" + + +#define L (SABER_N / 64) + +static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { + return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); +} + +static void schoolbook_avx_acc(__m256i *c, const __m256i *a, 
const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = mul_add(a0, b0, c[0]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[1] = _mm256_add_epi16(temp, c[1]); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + temp = mul_add(a2, b0, temp); + c[2] = _mm256_add_epi16(temp, c[2]); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + temp = mul_add(a3, b0, temp); + c[3] = _mm256_add_epi16(temp, c[3]); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + temp = mul_add(a2, b2, temp); + c[4] = _mm256_add_epi16(temp, c[4]); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + temp = mul_add(a5, b0, temp); + c[5] = _mm256_add_epi16(temp, c[5]); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a4, b2, temp); + c[6] = _mm256_add_epi16(temp, c[6]); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, b0, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a5, b2, temp); + c[7] = _mm256_add_epi16(temp, c[7]); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a3, b5, 
temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a6, b2, temp); + c[8] = _mm256_add_epi16(temp, c[8]); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a7, b2, temp); + c[9] = _mm256_add_epi16(temp, c[9]); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a[8], b2, temp); + c[10] = _mm256_add_epi16(temp, c[10]); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + temp = mul_add(a[9], b2, temp); + c[11] = _mm256_add_epi16(temp, c[11]); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + temp = mul_add(a[10], b2, temp); + c[12] = _mm256_add_epi16(temp, c[12]); + + temp = _mm256_mullo_epi16(a0, b[13]); + temp = mul_add(a1, b[12], temp); + temp = mul_add(a[12], b1, temp); + temp = 
mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + temp = mul_add(a[11], b2, temp); + c[13] = _mm256_add_epi16(temp, c[13]); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + temp = mul_add(a[12], b2, temp); + c[14] = _mm256_add_epi16(temp, c[14]); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + temp = mul_add(a[13], b2, temp); + c[15] = _mm256_add_epi16(temp, c[15]); + + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + temp = mul_add(a[5], b4, temp); + temp = 
mul_add(a[6], b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + temp = mul_add(a1, b[1], temp); + c[16] = _mm256_add_epi16(temp, c[16]); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + temp = mul_add(a1, b[2], temp); + c[17] = _mm256_add_epi16(temp, c[17]); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + temp = mul_add(a1, b[3], temp); + c[18] = _mm256_add_epi16(temp, c[18]); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + temp = mul_add(a1, b[4], temp); + c[19] = _mm256_add_epi16(temp, c[19]); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a6, b4, temp); + 
temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); + temp = mul_add(a0, b[6], temp); + temp = mul_add(a1, b[5], temp); + c[20] = _mm256_add_epi16(temp, c[20]); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + temp = mul_add(a1, b[6], temp); + c[21] = _mm256_add_epi16(temp, c[21]); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + temp = mul_add(a1, b[7], temp); + c[22] = _mm256_add_epi16(temp, c[22]); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + temp = mul_add(a1, b7, temp); + c[23] = _mm256_add_epi16(temp, c[23]); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + temp = mul_add(a1, b6, temp); + c[24] = _mm256_add_epi16(temp, c[24]); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + temp = mul_add(a1, b5, temp); + c[25] = _mm256_add_epi16(temp, c[25]); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + temp = mul_add(a1, b4, temp); + c[26] = _mm256_add_epi16(temp, c[26]); + + temp = 
_mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + temp = mul_add(a1, b3, temp); + c[27] = _mm256_add_epi16(temp, c[27]); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + temp = mul_add(a1, b2, temp); + c[28] = _mm256_add_epi16(temp, c[28]); + + temp = _mm256_mullo_epi16(a0, b1); + temp = mul_add(a1, b0, temp); + c[29] = _mm256_add_epi16(temp, c[29]); + + c[30] = mul_add(a1, b1, c[30]); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + + +static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; + __m256i temp; + + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + b4 = b[4]; + b5 = b[5]; + b6 = b[6]; + b7 = b[7]; + + c[0] = _mm256_mullo_epi16(a0, b0); + + temp = _mm256_mullo_epi16(a0, b1); + c[1] = mul_add(a1, b0, temp); + + temp = _mm256_mullo_epi16(a0, b2); + temp = mul_add(a1, b1, temp); + c[2] = mul_add(a2, b0, temp); + + temp = _mm256_mullo_epi16(a0, b3); + temp = mul_add(a1, b2, temp); + temp = mul_add(a2, b1, temp); + c[3] = mul_add(a3, b0, temp); + + temp = _mm256_mullo_epi16(a0, b4); + temp = mul_add(a1, b3, temp); + temp = mul_add(a3, b1, temp); + temp = mul_add(a4, b0, temp); + c[4] = mul_add(a2, b2, temp); + + temp = _mm256_mullo_epi16(a0, b5); + temp = mul_add(a1, b4, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add( a4, b1, temp); + c[5] = mul_add(a5, b0, temp); + + temp = _mm256_mullo_epi16(a0, b6); + temp = mul_add(a1, b5, temp); + temp = mul_add(a5, b1, temp); + temp = mul_add(a6, b0, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a3, b3, temp); + c[6] = mul_add(a4, b2, temp); + + temp = _mm256_mullo_epi16(a0, b7); + temp = mul_add(a1, b6, temp); + temp = mul_add(a6, b1, temp); + temp = mul_add(a7, b0, temp); + temp = mul_add(a2, b5, temp); 
+ temp = mul_add(a3, b4, temp); + temp = mul_add(a4, b3, temp); + c[7] = mul_add(a5, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[8]); + temp = mul_add(a1, b7, temp); + temp = mul_add(a7, b1, temp); + temp = mul_add(a[8], b0, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a5, b3, temp); + c[8] = mul_add(a6, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[9]); + temp = mul_add(a1, b[8], temp); + temp = mul_add(a[8], b1, temp); + temp = mul_add(a[9], b0, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a6, b3, temp); + c[9] = mul_add(a7, b2, temp); + + temp = _mm256_mullo_epi16(a0, b[10]); + temp = mul_add(a1, b[9], temp); + temp = mul_add(a[9], b1, temp); + temp = mul_add(a[10], b0, temp); + temp = mul_add(a2, b[8], temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a7, b3, temp); + c[10] = mul_add(a[8], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[11]); + temp = mul_add(a1, b[10], temp); + temp = mul_add(a[10], b1, temp); + temp = mul_add(a[11], b0, temp); + temp = mul_add(a2, b[9], temp); + temp = mul_add(a3, b[8], temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a[8], b3, temp); + c[11] = mul_add(a[9], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[12]); + temp = mul_add(a1, b[11], temp); + temp = mul_add(a[11], b1, temp); + temp = mul_add(a[12], b0, temp); + temp = mul_add(a2, b[10], temp); + temp = mul_add(a3, b[9], temp); + temp = mul_add(a4, b[8], temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a[8], b4, temp); + temp = mul_add(a[9], b3, temp); + c[12] = mul_add(a[10], b2, temp); + + temp = _mm256_mullo_epi16(a0, 
b[13]); + temp = mul_add(a1, b[12], temp); + temp = mul_add(a[12], b1, temp); + temp = mul_add(a[13], b0, temp); + temp = mul_add(a2, b[11], temp); + temp = mul_add(a3, b[10], temp); + temp = mul_add(a4, b[9], temp); + temp = mul_add(a5, b[8], temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a[8], b5, temp); + temp = mul_add(a[9], b4, temp); + temp = mul_add(a[10], b3, temp); + c[13] = mul_add(a[11], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[14]); + temp = mul_add(a1, b[13], temp); + temp = mul_add(a[13], b1, temp); + temp = mul_add(a[14], b0, temp); + temp = mul_add(a2, b[12], temp); + temp = mul_add(a3, b[11], temp); + temp = mul_add(a4, b[10], temp); + temp = mul_add(a5, b[9], temp); + temp = mul_add(a6, b[8], temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a[8], b6, temp); + temp = mul_add(a[9], b5, temp); + temp = mul_add(a[10], b4, temp); + temp = mul_add(a[11], b3, temp); + c[14] = mul_add(a[12], b2, temp); + + temp = _mm256_mullo_epi16(a0, b[15]); + temp = mul_add(a1, b[14], temp); + temp = mul_add(a[14], b1, temp); + temp = mul_add(a[15], b0, temp); + temp = mul_add(a2, b[13], temp); + temp = mul_add(a3, b[12], temp); + temp = mul_add(a4, b[11], temp); + temp = mul_add(a5, b[10], temp); + temp = mul_add(a6, b[9], temp); + temp = mul_add(a7, b[8], temp); + temp = mul_add(a[8], b7, temp); + temp = mul_add(a[9], b6, temp); + temp = mul_add(a[10], b5, temp); + temp = mul_add(a[11], b4, temp); + temp = mul_add(a[12], b3, temp); + c[15] = mul_add(a[13], b2, temp); + + // unrolled second triangle + a0 = a[14]; + a1 = a[15]; + a2 = a[13]; + a3 = a[12]; + a4 = a[11]; + a5 = a[10]; + a6 = a[9]; + a7 = a[8]; + + b0 = b[14]; + b1 = b[15]; + b2 = b[13]; + b3 = b[12]; + b4 = b[11]; + b5 = b[10]; + b6 = b[9]; + b7 = b[8]; + + temp = _mm256_mullo_epi16(a[1], b1); + temp = mul_add(a[2], b0, temp); + temp = mul_add(a[3], b2, temp); + temp = mul_add(a[4], b3, temp); + temp = mul_add(a[5], b4, temp); + temp = mul_add(a[6], 
b5, temp); + temp = mul_add(a[7], b6, temp); + temp = mul_add(a7, b7, temp); + temp = mul_add(a6, b[7], temp); + temp = mul_add(a5, b[6], temp); + temp = mul_add(a4, b[5], temp); + temp = mul_add(a3, b[4], temp); + temp = mul_add(a2, b[3], temp); + temp = mul_add(a0, b[2], temp); + c[16] = mul_add(a1, b[1], temp); + + temp = _mm256_mullo_epi16(a[2], b1); + temp = mul_add(a[3], b0, temp); + temp = mul_add(a[4], b2, temp); + temp = mul_add(a[5], b3, temp); + temp = mul_add(a[6], b4, temp); + temp = mul_add(a[7], b5, temp); + temp = mul_add(a7, b6, temp); + temp = mul_add(a6, b7, temp); + temp = mul_add(a5, b[7], temp); + temp = mul_add(a4, b[6], temp); + temp = mul_add(a3, b[5], temp); + temp = mul_add(a2, b[4], temp); + temp = mul_add(a0, b[3], temp); + c[17] = mul_add(a1, b[2], temp); + + temp = _mm256_mullo_epi16(a[3], b1); + temp = mul_add(a[4], b0, temp); + temp = mul_add(a[5], b2, temp); + temp = mul_add(a[6], b3, temp); + temp = mul_add(a[7], b4, temp); + temp = mul_add(a7, b5, temp); + temp = mul_add(a6, b6, temp); + temp = mul_add(a5, b7, temp); + temp = mul_add(a4, b[7], temp); + temp = mul_add(a3, b[6], temp); + temp = mul_add(a2, b[5], temp); + temp = mul_add(a0, b[4], temp); + c[18] = mul_add(a1, b[3], temp); + + temp = _mm256_mullo_epi16(a[4], b1); + temp = mul_add(a[5], b0, temp); + temp = mul_add(a[6], b2, temp); + temp = mul_add(a[7], b3, temp); + temp = mul_add(a7, b4, temp); + temp = mul_add(a6, b5, temp); + temp = mul_add(a5, b6, temp); + temp = mul_add(a4, b7, temp); + temp = mul_add(a3, b[7], temp); + temp = mul_add(a2, b[6], temp); + temp = mul_add(a0, b[5], temp); + c[19] = mul_add(a1, b[4], temp); + + temp = _mm256_mullo_epi16(a[5], b1); + temp = mul_add(a[6], b0, temp); + temp = mul_add(a[7], b2, temp); + temp = mul_add(a7, b3, temp); + temp = mul_add(a6, b4, temp); + temp = mul_add(a5, b5, temp); + temp = mul_add(a4, b6, temp); + temp = mul_add(a3, b7, temp); + temp = mul_add(a2, b[7], temp); + temp = mul_add(a0, b[6], temp); + c[20] = 
mul_add(a1, b[5], temp); + + temp = _mm256_mullo_epi16(a[6], b1); + temp = mul_add(a[7], b0, temp); + temp = mul_add(a7, b2, temp); + temp = mul_add(a6, b3, temp); + temp = mul_add(a5, b4, temp); + temp = mul_add(a4, b5, temp); + temp = mul_add(a3, b6, temp); + temp = mul_add(a2, b7, temp); + temp = mul_add(a0, b[7], temp); + c[21] = mul_add(a1, b[6], temp); + + temp = _mm256_mullo_epi16(a[7], b1); + temp = mul_add(a7, b0, temp); + temp = mul_add(a6, b2, temp); + temp = mul_add(a5, b3, temp); + temp = mul_add(a4, b4, temp); + temp = mul_add(a3, b5, temp); + temp = mul_add(a2, b6, temp); + temp = mul_add(a0, b7, temp); + c[22] = mul_add(a1, b[7], temp); + + temp = _mm256_mullo_epi16(a7, b1); + temp = mul_add(a6, b0, temp); + temp = mul_add(a5, b2, temp); + temp = mul_add(a4, b3, temp); + temp = mul_add(a3, b4, temp); + temp = mul_add(a2, b5, temp); + temp = mul_add(a0, b6, temp); + c[23] = mul_add(a1, b7, temp); + + temp = _mm256_mullo_epi16(a6, b1); + temp = mul_add(a5, b0, temp); + temp = mul_add(a4, b2, temp); + temp = mul_add(a3, b3, temp); + temp = mul_add(a2, b4, temp); + temp = mul_add(a0, b5, temp); + c[24] = mul_add(a1, b6, temp); + + temp = _mm256_mullo_epi16(a5, b1); + temp = mul_add(a4, b0, temp); + temp = mul_add(a3, b2, temp); + temp = mul_add(a2, b3, temp); + temp = mul_add(a0, b4, temp); + c[25] = mul_add(a1, b5, temp); + + temp = _mm256_mullo_epi16(a4, b1); + temp = mul_add(a3, b0, temp); + temp = mul_add(a2, b2, temp); + temp = mul_add(a0, b3, temp); + c[26] = mul_add(a1, b4, temp); + + temp = _mm256_mullo_epi16(a3, b1); + temp = mul_add(a2, b0, temp); + temp = mul_add(a0, b2, temp); + c[27] = mul_add(a1, b3, temp); + + temp = _mm256_mullo_epi16(a2, b1); + temp = mul_add(a0, b0, temp); + c[28] = mul_add(a1, b2, temp); + + temp = _mm256_mullo_epi16(a0, b1); + c[29] = mul_add(a1, b0, temp); + + c[30] = _mm256_mullo_epi16(a1, b1); + + c[31] = _mm256_set_epi64x(0, 0, 0, 0); +} + +static void transpose(__m256i *M) { + __m256i r0, r1, r2, r3, r4, r5, r6, 
r7, r8, r9, r10, r11; + __m256i temp, temp0, temp1, temp2; + + r0 = _mm256_unpacklo_epi16(M[0], M[1]); + r1 = _mm256_unpacklo_epi16(M[2], M[3]); + r2 = _mm256_unpacklo_epi16(M[4], M[5]); + r3 = _mm256_unpacklo_epi16(M[6], M[7]); + r4 = _mm256_unpacklo_epi16(M[8], M[9]); + r5 = _mm256_unpacklo_epi16(M[10], M[11]); + r6 = _mm256_unpacklo_epi16(M[12], M[13]); + r7 = _mm256_unpacklo_epi16(M[14], M[15]); + + temp = _mm256_unpacklo_epi32(r0, r1); + temp0 = _mm256_unpacklo_epi32(r2, r3); + temp1 = _mm256_unpacklo_epi32(r4, r5); + temp2 = _mm256_unpacklo_epi32(r6, r7); + + r8 = _mm256_unpackhi_epi32(r0, r1); + r9 = _mm256_unpackhi_epi32(r2, r3); + r10 = _mm256_unpackhi_epi32(r4, r5); + r11 = _mm256_unpackhi_epi32(r6, r7); + + r0 = _mm256_unpacklo_epi64(temp, temp0); + r2 = _mm256_unpackhi_epi64(temp, temp0); + r1 = _mm256_unpacklo_epi64(temp1, temp2); + r3 = _mm256_unpackhi_epi64(temp1, temp2); + + temp = _mm256_unpackhi_epi16(M[0], M[1]); + temp0 = _mm256_unpackhi_epi16(M[2], M[3]); + temp1 = _mm256_unpackhi_epi16(M[4], M[5]); + temp2 = _mm256_unpackhi_epi16(M[6], M[7]); + + r4 = _mm256_unpackhi_epi16(M[8], M[9]); + M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); + M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); + r5 = _mm256_unpackhi_epi16(M[10], M[11]); + r6 = _mm256_unpackhi_epi16(M[12], M[13]); + r7 = _mm256_unpackhi_epi16(M[14], M[15]); + + r0 = _mm256_unpacklo_epi64(r8, r9); + r1 = _mm256_unpacklo_epi64(r10, r11); + r2 = _mm256_unpackhi_epi64(r8, r9); + r3 = _mm256_unpackhi_epi64(r10, r11); + + M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); + M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); + M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); + M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); + + r0 = _mm256_unpacklo_epi32(temp, temp0); + r1 = _mm256_unpacklo_epi32(temp1, temp2); + r2 = _mm256_unpacklo_epi32(r4, r5); + r3 = _mm256_unpacklo_epi32(r6, r7); + + r8 = 
_mm256_unpacklo_epi64(r0, r1); + r10 = _mm256_unpackhi_epi64(r0, r1); + r9 = _mm256_unpacklo_epi64(r2, r3); + r11 = _mm256_unpackhi_epi64(r2, r3); + + M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); + M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); + M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); + M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); + + r0 = _mm256_unpackhi_epi32(temp, temp0); + r1 = _mm256_unpackhi_epi32(temp1, temp2); + r2 = _mm256_unpackhi_epi32(r4, r5); + r3 = _mm256_unpackhi_epi32(r6, r7); + + r4 = _mm256_unpacklo_epi64(r0, r1); + r6 = _mm256_unpackhi_epi64(r0, r1); + r5 = _mm256_unpacklo_epi64(r2, r3); + r7 = _mm256_unpackhi_epi64(r2, r3); + + M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); + M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); + M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); + M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); +} + +static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) { + toom4_points a_eval;// Holds evaluation (a & b) for 7 Karatsuba at a time + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + __m256i *va = (__m256i *)a_eval.coeffs; + __m256i *vb = (__m256i *)b_eval->coeffs; + __m256i *vc = (__m256i *)c_eval->coeffs; + + //------------------AVX evaluation for 1st poly----------------------- + r0_avx = a[0 * L + 0]; + r1_avx = a[0 * L + 1]; + r2_avx = a[0 * L + 2]; + r3_avx = a[0 * L + 3]; + + va[0] = r0_avx; + va[1] = r1_avx; + va[2] = r2_avx; + va[3] = r3_avx; + va[4] = _mm256_add_epi16(r0_avx, r1_avx); + va[5] = _mm256_add_epi16(r2_avx, r3_avx); + va[6] = _mm256_add_epi16(r0_avx, r2_avx); + va[7] = _mm256_add_epi16(r1_avx, r3_avx); + va[8] = _mm256_add_epi16(va[6], va[7]); + //------------------AVX evaluation for 1st poly ends------------------ + + //------------------AVX evaluation for 2nd poly----------------------- + r0_avx = a[1 * L + 0]; + r1_avx = a[1 * L + 1]; + r2_avx = a[1 * L + 2]; + r3_avx = a[1 * L + 3]; + + va[0 + 
9] = r0_avx; + va[1 + 9] = r1_avx; + va[2 + 9] = r2_avx; + va[3 + 9] = r3_avx; + va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]); + //------------------AVX evaluation for 2nd poly ends------------------ + + //------------------AVX evaluation for 3rd poly----------------------- + r0_avx = a[2 * L + 0]; + r1_avx = a[2 * L + 1]; + r2_avx = a[2 * L + 2]; + r3_avx = a[2 * L + 3]; + + va[0 + 18] = r0_avx; + va[1 + 18] = r1_avx; + va[2 + 18] = r2_avx; + va[3 + 18] = r3_avx; + va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]); + //------------------AVX evaluation for 3rd poly ends------------------ + + //------------------AVX evaluation for 4th poly----------------------- + r0_avx = a[3 * L + 0]; + r1_avx = a[3 * L + 1]; + r2_avx = a[3 * L + 2]; + r3_avx = a[3 * L + 3]; + + va[0 + 27] = r0_avx; + va[1 + 27] = r1_avx; + va[2 + 27] = r2_avx; + va[3 + 27] = r3_avx; + va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]); + //------------------AVX evaluation for 4th poly ends------------------ + + //------------------AVX evaluation for 5th poly----------------------- + r0_avx = a[4 * L + 0]; + r1_avx = a[4 * L + 1]; + r2_avx = a[4 * L + 2]; + r3_avx = a[4 * L + 3]; + + va[0 + 36] = r0_avx; + va[1 + 36] = r1_avx; + va[2 + 36] = r2_avx; + va[3 + 36] = r3_avx; + va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 36] = _mm256_add_epi16(r0_avx, 
r2_avx); + va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]); + //------------------AVX evaluation for 5th poly ends------------------ + + //------------------AVX evaluation for 6th poly----------------------- + r0_avx = a[5 * L + 0]; + r1_avx = a[5 * L + 1]; + r2_avx = a[5 * L + 2]; + r3_avx = a[5 * L + 3]; + + va[0 + 45] = r0_avx; + va[1 + 45] = r1_avx; + va[2 + 45] = r2_avx; + va[3 + 45] = r3_avx; + va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]); + //------------------AVX evaluation for 6th poly ends------------------ + + //------------------AVX evaluation for 7th poly----------------------- + r0_avx = a[6 * L + 0]; + r1_avx = a[6 * L + 1]; + r2_avx = a[6 * L + 2]; + r3_avx = a[6 * L + 3]; + + va[0 + 54] = r0_avx; + va[1 + 54] = r1_avx; + va[2 + 54] = r2_avx; + va[3 + 54] = r3_avx; + va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]); + //------------------AVX evaluation for 7th poly ends------------------ + + //-----------------Forward transposes-------------------------------------- + transpose(va); + transpose(va + 16); + transpose(va + 32); + transpose(va + 48); + //-----------------Forward transposes ends--------------------------------- + + if (accumulate == 0) { + schoolbook_avx(vc, va, vb); + schoolbook_avx(vc + 32, va + 16, vb + 16); + schoolbook_avx(vc + 64, va + 32, vb + 32); + schoolbook_avx(vc + 96, va + 48, vb + 48); + } else { + schoolbook_avx_acc(vc, va, vb); + schoolbook_avx_acc(vc + 32, va + 16, vb + 16); + schoolbook_avx_acc(vc + 64, va + 32, vb + 32); + schoolbook_avx_acc(vc + 96, va + 48, vb + 
48); + } +} + +static void karatsuba_eval(__m256i *b_eval, const __m256i *b) { + __m256i r0_avx, r1_avx, r2_avx, r3_avx; + + //-------1st poly---------------------------------------------------- + r0_avx = b[0 * L + 0]; + r1_avx = b[0 * L + 1]; + r2_avx = b[0 * L + 2]; + r3_avx = b[0 * L + 3]; + + b_eval[0] = r0_avx; + b_eval[1] = r1_avx; + b_eval[2] = r2_avx; + b_eval[3] = r3_avx; + b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]); + + //-------2nd poly---------------------------------------------------- + r0_avx = b[1 * L + 0]; + r1_avx = b[1 * L + 1]; + r2_avx = b[1 * L + 2]; + r3_avx = b[1 * L + 3]; + + b_eval[0 + 9] = r0_avx; + b_eval[1 + 9] = r1_avx; + b_eval[2 + 9] = r2_avx; + b_eval[3 + 9] = r3_avx; + b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]); + + //-------3rd poly---------------------------------------------------- + r0_avx = b[2 * L + 0]; + r1_avx = b[2 * L + 1]; + r2_avx = b[2 * L + 2]; + r3_avx = b[2 * L + 3]; + + b_eval[0 + 18] = r0_avx; + b_eval[1 + 18] = r1_avx; + b_eval[2 + 18] = r2_avx; + b_eval[3 + 18] = r3_avx; + b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]); + + //-------4th poly---------------------------------------------------- + r0_avx = b[3 * L + 0]; + r1_avx = b[3 * L + 1]; + r2_avx = b[3 * L + 2]; + r3_avx = b[3 * L + 3]; + + b_eval[0 + 27] = r0_avx; + b_eval[1 + 27] = r1_avx; + b_eval[2 + 27] = 
r2_avx; + b_eval[3 + 27] = r3_avx; + b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]); + + //-------5th poly---------------------------------------------------- + r0_avx = b[4 * L + 0]; + r1_avx = b[4 * L + 1]; + r2_avx = b[4 * L + 2]; + r3_avx = b[4 * L + 3]; + + b_eval[0 + 36] = r0_avx; + b_eval[1 + 36] = r1_avx; + b_eval[2 + 36] = r2_avx; + b_eval[3 + 36] = r3_avx; + b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]); + + //-------6th poly---------------------------------------------------- + r0_avx = b[5 * L + 0]; + r1_avx = b[5 * L + 1]; + r2_avx = b[5 * L + 2]; + r3_avx = b[5 * L + 3]; + + b_eval[0 + 45] = r0_avx; + b_eval[1 + 45] = r1_avx; + b_eval[2 + 45] = r2_avx; + b_eval[3 + 45] = r3_avx; + b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]); + + //-------7th poly---------------------------------------------------- + r0_avx = b[6 * L + 0]; + r1_avx = b[6 * L + 1]; + r2_avx = b[6 * L + 2]; + r3_avx = b[6 * L + 3]; + + b_eval[0 + 54] = r0_avx; + b_eval[1 + 54] = r1_avx; + b_eval[2 + 54] = r2_avx; + b_eval[3 + 54] = r3_avx; + b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx); + b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx); + b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx); + b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx); + b_eval[8 + 54] = _mm256_add_epi16(b_eval[6 + 54], 
b_eval[7 + 54]); + + //--------------Evaluating B poly ends------------------------------- + transpose(b_eval); + transpose(b_eval + 16); + transpose(b_eval + 32); + transpose(b_eval + 48); +} + +static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) { + __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results + __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; + + //------------------------AVX interpolation for 1st poly external------------------- + res_avx0 = c_eval[0]; + res_avx2 = c_eval[1]; + res_avx4 = c_eval[2]; + res_avx6 = c_eval[3]; + c6_avx = c_eval[6]; + c7_avx = c_eval[7]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx); + + res_avx1 = c_eval[16]; + res_avx3 = c_eval[17]; + res_avx5 = c_eval[18]; + res_avx7 = c_eval[19]; + c22_avx = c_eval[22]; + c23_avx = c_eval[23]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + 
c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final0[0] = res_avx0; + result_final0[1] = res_avx1; + result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final0[6] = res_avx6; + result_final0[7] = res_avx7; + //------------------------AVX interpolation for 1st poly ends-------------- + + + //------------------------AVX interpolation for 2nd poly external------------------- + res_avx0 = c_eval[9]; //c_eval0 + res_avx2 = c_eval[10]; //c_eval1 + res_avx4 = c_eval[11]; //c_eval2 + res_avx6 = c_eval[12]; //c_eval3 + c6_avx = c_eval[15]; //c_eval6 + c7_avx = c_eval[32]; //c_eval7 + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx); + + res_avx1 = c_eval[25]; //c_eval0 + res_avx3 = c_eval[26]; //c_eval1 + res_avx5 = c_eval[27]; //c_eval2 + res_avx7 = c_eval[28]; //c_eval3 + c22_avx = c_eval[31]; + c23_avx = c_eval[48]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final1[0] = res_avx0; + result_final1[1] = res_avx1; + result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final1[6] = res_avx6; + result_final1[7] = res_avx7; + //------------------------AVX interpolation for 2nd poly ends-------------- + + //------------------------AVX interpolation for 3rd poly external------------------- + res_avx0 = c_eval[34]; //c_eval0 + res_avx2 = c_eval[35]; //c_eval1 + res_avx4 = c_eval[36]; + res_avx6 = c_eval[37]; + c6_avx = c_eval[40]; + c7_avx = c_eval[41]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx); + + res_avx1 = c_eval[50]; //c_eval0 + res_avx3 = c_eval[51]; //c_eval1 + res_avx5 = c_eval[52]; + res_avx7 = c_eval[53]; + c22_avx = c_eval[56]; + c23_avx = c_eval[57]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + 
result_final2[0] = res_avx0; + result_final2[1] = res_avx1; + result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final2[6] = res_avx6; + result_final2[7] = res_avx7; + //------------------------AVX interpolation for 3rd poly ends-------------- + + //------------------------AVX interpolation for 4th poly external------------------- + res_avx0 = c_eval[43]; + res_avx2 = c_eval[44]; + res_avx4 = c_eval[45]; + res_avx6 = c_eval[46]; + c6_avx = c_eval[65]; + c7_avx = c_eval[66]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx); + + res_avx1 = c_eval[59]; + res_avx3 = c_eval[60]; + res_avx5 = c_eval[61]; + res_avx7 = c_eval[62]; + c22_avx = c_eval[81]; + c23_avx = c_eval[82]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final3[0] = res_avx0; + result_final3[1] = res_avx1; + result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx); + 
result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final3[6] = res_avx6; + result_final3[7] = res_avx7; + //------------------------AVX interpolation for 4th poly ends-------------- + + //------------------------AVX interpolation for 5th poly external------------------- + res_avx0 = c_eval[68]; + res_avx2 = c_eval[69]; + res_avx4 = c_eval[70]; + res_avx6 = c_eval[71]; + c6_avx = c_eval[74]; + c7_avx = c_eval[75]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx); + + res_avx1 = c_eval[84]; + res_avx3 = c_eval[85]; + res_avx5 = c_eval[86]; + res_avx7 = c_eval[87]; + c22_avx = c_eval[90]; + c23_avx = c_eval[91]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final4[0] = res_avx0; + result_final4[1] = res_avx1; + result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final4[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx); + 
result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final4[6] = res_avx6; + result_final4[7] = res_avx7; + //------------------------AVX interpolation for 5th poly ends-------------- + + //------------------------AVX interpolation for 6th poly external------------------- + res_avx0 = c_eval[77]; + res_avx2 = c_eval[78]; + res_avx4 = c_eval[79]; + res_avx6 = c_eval[96]; + c6_avx = c_eval[99]; + c7_avx = c_eval[100]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx); + + res_avx1 = c_eval[93]; + res_avx3 = c_eval[94]; + res_avx5 = c_eval[95]; + res_avx7 = c_eval[112]; + c22_avx = c_eval[115]; + c23_avx = c_eval[116]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final5[0] = res_avx0; + result_final5[1] = res_avx1; + result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final5[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final5[6] = res_avx6; + result_final5[7] = 
res_avx7; + //------------------------AVX interpolation for 6th poly ends-------------- + + //------------------------AVX interpolation for 7th poly external------------------- + res_avx0 = c_eval[102]; + res_avx2 = c_eval[103]; + res_avx4 = c_eval[104]; + res_avx6 = c_eval[105]; + c6_avx = c_eval[108]; + c7_avx = c_eval[109]; + + c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx); + + res_avx1 = c_eval[118]; + res_avx3 = c_eval[119]; + res_avx5 = c_eval[120]; + res_avx7 = c_eval[121]; + c22_avx = c_eval[124]; + c23_avx = c_eval[125]; + + c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7); + c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx); + c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6); + res_avx5 = _mm256_add_epi16(res_avx5, temp); + temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2); + res_avx1 = _mm256_add_epi16(res_avx1, temp); + c22_avx = _mm256_add_epi16(c22_avx, c8_avx); + res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); + res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); + c7_avx = _mm256_add_epi16(c7_avx, c24_avx); + c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); + c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); + c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); + c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); + + result_final6[0] = res_avx0; + result_final6[1] = res_avx1; + result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx); + result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx); + result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx); + result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx); + result_final6[6] = res_avx6; + result_final6[7] = res_avx7; + //------------------------AVX interpolation for 7th poly ends-------------- +} + +void 
PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i aw_avx[7 * L]; + __m256i *va = (__m256i *)a->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = va[0 * L + i]; + r1_avx = va[1 * L + i]; + r2_avx = va[2 * L + i]; + r3_avx = va[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + aw_avx[6 * L + i] = r0_avx; + aw_avx[0 * L + i] = r3_avx; + } + + batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate); +} + +void PQCLEAN_SABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) { + size_t i; + __m256i bw_avx[7 * L]; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; + __m256i *vb = (__m256i *)b->coeffs; + __m256i *vb_eval = (__m256i *)b_eval->coeffs; + + for (i = 0; i < L; i++) { + r0_avx = vb[0 * L + i]; + r1_avx = vb[1 * L + i]; + r2_avx = vb[2 * L + i]; + r3_avx = vb[3 * L + i]; + r4_avx = _mm256_add_epi16(r0_avx, r2_avx); + r5_avx = _mm256_add_epi16(r1_avx, r3_avx); + bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r0_avx, 2); + r4_avx 
= _mm256_add_epi16(r4_avx, r2_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r5_avx = _mm256_slli_epi16(r1_avx, 2); + r5_avx = _mm256_add_epi16(r5_avx, r3_avx); + bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); + bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); + r4_avx = _mm256_slli_epi16(r3_avx, 3); + r6_avx = _mm256_slli_epi16(r2_avx, 2); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + r6_avx = _mm256_slli_epi16(r1_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r6_avx); + bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); + bw_avx[6 * L + i] = r0_avx; + bw_avx[0 * L + i] = r3_avx; + } + + karatsuba_eval(vb_eval, bw_avx); +} + + +void PQCLEAN_SABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) { + size_t i; + __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx; + __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; + __m256i res_full[32]; + __m256i *vc = (__m256i *)c_eval->coeffs; + __m256i *vres = (__m256i *)res->coeffs; + + transpose(vc); + transpose(vc + 16); + transpose(vc + 32); + transpose(vc + 48); + transpose(vc + 64); + transpose(vc + 80); + transpose(vc + 96); + transpose(vc + 112); + + karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc); + + for (i = 0; i < 2 * L; i++) { + r0_avx = w1_avx[i]; + r1_avx = w2_avx[i]; + r2_avx = w3_avx[i]; + r3_avx = w4_avx[i]; + r4_avx = w5_avx[i]; + r5_avx = w6_avx[i]; + r6_avx = w7_avx[i]; + + r1_avx = _mm256_add_epi16(r1_avx, r4_avx); + r5_avx = _mm256_sub_epi16(r5_avx, r4_avx); + r3_avx = _mm256_sub_epi16(r3_avx, r2_avx); + r3_avx = _mm256_srli_epi16(r3_avx, 1); + r4_avx = _mm256_sub_epi16(r4_avx, r0_avx); + temp_avx = _mm256_slli_epi16(r6_avx, 6); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_slli_epi16(r4_avx, 1); + r4_avx = _mm256_add_epi16(r4_avx, r5_avx); + r2_avx = _mm256_add_epi16(r2_avx, r3_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 6); + + r1_avx = 
_mm256_sub_epi16(r1_avx, temp_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r2_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r6_avx); + r2_avx = _mm256_sub_epi16(r2_avx, r0_avx); + temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45)); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + temp_avx = _mm256_slli_epi16(r2_avx, 3); + + r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); + r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16) + r4_avx = _mm256_srli_epi16(r4_avx, 3); + r5_avx = _mm256_add_epi16(r5_avx, r1_avx); + temp_avx = _mm256_slli_epi16(r3_avx, 4); + + r1_avx = _mm256_add_epi16(r1_avx, temp_avx); + r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16) + r1_avx = _mm256_srli_epi16(r1_avx, 1); + r3_avx = _mm256_add_epi16(r1_avx, r3_avx); + r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); + temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30)); + temp_avx = _mm256_sub_epi16(temp_avx, r5_avx); + temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16) + + r5_avx = _mm256_srli_epi16(temp_avx, 2); + r2_avx = _mm256_sub_epi16(r2_avx, r4_avx); + r1_avx = _mm256_sub_epi16(r1_avx, r5_avx); + + if (i < L) { + res_full[0 * L + i] = r6_avx; + res_full[1 * L + i] = r5_avx; + res_full[2 * L + i] = r4_avx; + res_full[3 * L + i] = r3_avx; + res_full[4 * L + i] = r2_avx; + res_full[5 * L + i] = r1_avx; + res_full[6 * L + i] = r0_avx; + } else { + res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx); + res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx); + res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx); + res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx); + res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx); + res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], r1_avx); + res_full[6 * L + i] = r0_avx; + } + } + + // Reduction by X^256 + 1 + for (i = 0; i < 16; 
i++) { + vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]); + } +} diff --git a/crypto_kem/saber/avx2/polymul/consts.h b/crypto_kem/saber/avx2/polymul/consts.h deleted file mode 100644 index 40826398..00000000 --- a/crypto_kem/saber/avx2/polymul/consts.h +++ /dev/null @@ -1,20 +0,0 @@ -#include "../SABER_params.h" - -#define AVX_N (SABER_N >> 4) -#define small_len_avx (AVX_N >> 2) - -#define SCHB_N 16 - -#define N_SB (SABER_N >> 2) -#define N_SB_RES (2*N_SB-1) - -#define N_SB_16 (N_SB >> 2) -#define N_SB_16_RES (2*N_SB_16-1) - -#define AVX_N1 16 /*N/16*/ - -#define SCM_SIZE 16 - -// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements -#define NUM_POLY SABER_K -//int NUM_POLY=2; diff --git a/crypto_kem/saber/avx2/polymul/matrix.c b/crypto_kem/saber/avx2/polymul/matrix.c deleted file mode 100644 index 5fa35783..00000000 --- a/crypto_kem/saber/avx2/polymul/matrix.c +++ /dev/null @@ -1,303 +0,0 @@ -#include - -static void transpose_n1(__m256i *M) -{ - //int i; - register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; - register __m256i temp, temp0, temp1, temp2; - - //for(i=0; i<8; i=i+1) - //{ - r0 = _mm256_unpacklo_epi16(M[0], M[1]); - r1 = _mm256_unpacklo_epi16(M[2], M[3]); - r2 = _mm256_unpacklo_epi16(M[4], M[5]); - r3 = _mm256_unpacklo_epi16(M[6], M[7]); - r4 = _mm256_unpacklo_epi16(M[8], M[9]); - r5 = _mm256_unpacklo_epi16(M[10], M[11]); - r6 = _mm256_unpacklo_epi16(M[12], M[13]); - r7 = _mm256_unpacklo_epi16(M[14], M[15]); - - - temp = _mm256_unpacklo_epi32(r0, r1); - temp0 = _mm256_unpacklo_epi32(r2, r3); - temp1 = _mm256_unpacklo_epi32(r4, r5); - temp2 = _mm256_unpacklo_epi32(r6, r7); - - r8 = _mm256_unpackhi_epi32(r0, r1); - r9 = _mm256_unpackhi_epi32(r2, r3); - r10 = _mm256_unpackhi_epi32(r4, r5); - r11 = _mm256_unpackhi_epi32(r6, r7); - - r0 = _mm256_unpacklo_epi64(temp, temp0); - r2 = _mm256_unpackhi_epi64(temp, temp0); - - r1 = _mm256_unpacklo_epi64(temp1, temp2); - r3 = 
_mm256_unpackhi_epi64(temp1, temp2); - - temp = _mm256_unpackhi_epi16(M[0], M[1]); - temp0 = _mm256_unpackhi_epi16(M[2], M[3]); - temp1 = _mm256_unpackhi_epi16(M[4], M[5]); - temp2 = _mm256_unpackhi_epi16(M[6], M[7]); - r4 = _mm256_unpackhi_epi16(M[8], M[9]); - - M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); - M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); - - - r5 = _mm256_unpackhi_epi16(M[10], M[11]); - r6 = _mm256_unpackhi_epi16(M[12], M[13]); - r7 = _mm256_unpackhi_epi16(M[14], M[15]); - - - - r0 = _mm256_unpacklo_epi64(r8, r9); - r1 = _mm256_unpacklo_epi64(r10, r11); - - r2 = _mm256_unpackhi_epi64(r8, r9); - r3 = _mm256_unpackhi_epi64(r10, r11); - - - - M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); - M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); - M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); - M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); - - - //for(i=0; i<4; i=i+1) - //{ - r0 = _mm256_unpacklo_epi32(temp, temp0); - r1 = _mm256_unpacklo_epi32(temp1, temp2); - r2 = _mm256_unpacklo_epi32(r4, r5); - r3 = _mm256_unpacklo_epi32(r6, r7); - - //} - - - //for(i=0; i<2; i=i+1) - //{ - r8 = _mm256_unpacklo_epi64(r0, r1); - r10 = _mm256_unpackhi_epi64(r0, r1); - - r9 = _mm256_unpacklo_epi64(r2, r3); - r11 = _mm256_unpackhi_epi64(r2, r3); - - M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); - M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); - M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); - M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); - - r0 = _mm256_unpackhi_epi32(temp, temp0); - r1 = _mm256_unpackhi_epi32(temp1, temp2); - r2 = _mm256_unpackhi_epi32(r4, r5); - r3 = _mm256_unpackhi_epi32(r6, r7); - - //} -// for(i=0; i<2; i=i+1) -// { - r4 = _mm256_unpacklo_epi64(r0, r1); - r6 = _mm256_unpackhi_epi64(r0, r1); - - r5 = _mm256_unpacklo_epi64(r2, r3); - r7 = _mm256_unpackhi_epi64(r2, r3); - -// } - - 
//------------------------------------------------------- - - M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); - M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); - M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); - M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); -} - -/* -void transpose_unrolled(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; - - __m256i r0, r1, r2, r3, r4, r5, r6, r7; - - //for(i=0; i<8; i=i+1) - //{ - tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); - tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); - - tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); - tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); - - tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); - tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); - - tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); - tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); - - tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); - tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); - - tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); - tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); - - tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); - tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); - - tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); - tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); - - //} - - //------------------------------------------------------- - //for(i=0; i<4; i=i+1) - //{ - bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); - bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); - - bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); - bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); - - bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); - bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); - - bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); - bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); - - //} - - //for(i=0; i<2; i=i+1) - //{ - dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); - dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); - - dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); - dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); - - M[0] = 
_mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - //} - //for(i=0; i<2; i=i+1) - //{ - eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); - eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); - - eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); - eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); - - //} - - //------------------------------------------------------- - - //------------------------------------------------------- - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - //------------------------------------------------------- - - - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} - - -void transpose1(__m256i *M) -{ - int i; - __m256i tL[8], tH[8]; - __m256i bL[4], bH[4], cL[4], cH[4]; - __m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; - - for(i=0; i<8; i=i+1) - { - tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); - 
tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); - } - - for(i=0; i<4; i=i+1) - { - bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); - bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); - } - for(i=0; i<4; i=i+1) - { - cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); - cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); - dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); - eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); - } - - for(i=0; i<2; i=i+1) - { - fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); - fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); - } - for(i=0; i<2; i=i+1) - { - gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); - gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); - } - - M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); - M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); - M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); - M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); - - M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); - M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); - M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); - M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); - - M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); - M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); - M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); - M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); - - M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); - M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); - M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); - M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); -} -*/ diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c deleted file mode 100644 index 48870f51..00000000 --- a/crypto_kem/saber/avx2/polymul/scm_avx.c +++ 
/dev/null @@ -1,753 +0,0 @@ -//#define SCM_SIZE 16 - -//#pragma STDC FP_CONTRACT ON - -#include - -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - - -static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are added cummulatively -{ - - register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - register __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first triangle - - //otherwise accumulate - c_avx[0] = mul_add(a0, b0, c_avx[0]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); - - - temp = _mm256_mullo_epi16 (a0, b2); - temp = mul_add(a1, b1, temp); - temp=mul_add(a2, b0, temp); - c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); - - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp=mul_add(a3, b0, temp); - c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp=mul_add(a2, b2, temp); - c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); - - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp=mul_add(a5, b0, temp); - c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp=mul_add(a4, b2, temp); - c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); - - - temp = 
_mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - temp=mul_add(a5, b2, temp); - c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - - temp=mul_add(a6, b2, temp); - c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); - - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - temp=mul_add(a7, b2, temp); - c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); - - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - temp=mul_add(a[8], b2, temp); - c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); - - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - temp=mul_add(a[9], b2, temp); - c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); - - - temp = _mm256_mullo_epi16 (a0, 
b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - temp=mul_add(a[10], b2, temp); - c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); - - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - temp=mul_add(a[11], b2, temp); - c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); - - - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); - temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - temp=mul_add(a[12], b2, temp); - c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); - - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp ); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, 
b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - temp=mul_add(a[13], b2, temp); - c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - temp=mul_add(a1, b[1], temp); - c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); - - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - temp=mul_add(a1, b[2], temp); - c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); - - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); - temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], 
temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - temp=mul_add(a1, b[3], temp); - c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); - - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - temp=mul_add(a1, b[4], temp); - c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); - - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - temp=mul_add(a1, b[5], temp); - c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); - - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - temp=mul_add(a1, b[6], temp); - c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); - - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - temp=mul_add(a1, b[7], temp); - c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); - - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add (a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = 
mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - temp=mul_add(a1, b7, temp); - c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); - - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - temp=mul_add(a1, b6, temp); - c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); - - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - temp=mul_add(a1, b5, temp); - c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); - - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - temp=mul_add(a1, b4, temp); - c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); - - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - temp=mul_add(a1, b3, temp); - c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); - - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - temp=mul_add(a1, b2, temp); - c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); - - - temp = _mm256_mullo_epi16 (a0, b1); - temp=mul_add(a1, b0, temp); - c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); - - - c_avx[30] = mul_add(a1, b1, c_avx[30]); - - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - - -} - - - -static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched - //the c_avx are not added cummulatively -{ - - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - - - a0=a[0]; - a1=a[1]; - a2=a[2]; - a3=a[3]; - a4=a[4]; - a5=a[5]; - a6=a[6]; - a7=a[7]; - - b0=b[0]; - b1=b[1]; - b2=b[2]; - b3=b[3]; - b4=b[4]; - b5=b[5]; - b6=b[6]; - b7=b[7]; - - // New Unrolled first 
triangle - c_avx[0] = _mm256_mullo_epi16 (a0, b0); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[1]=mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b2); - - temp = mul_add(a1, b1, temp); - c_avx[2]= mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c_avx[3]= mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c_avx[4]= mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b5); - temp = mul_add(a1, b4 , temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c_avx[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16 (a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c_avx[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add (a6, b1, temp); - temp = mul_add (a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add (a3, b4, temp); - temp = mul_add (a4, b3, temp); - c_avx[7] = mul_add (a5, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[8]); - temp = mul_add (a1, b7, temp); - temp = mul_add (a7, b1, temp); - temp = mul_add (a[8], b0, temp); - temp = mul_add (a2, b6,temp); - temp = mul_add(a3, b5, temp); - temp = mul_add (a4, b4,temp); - temp = mul_add (a5, b3, temp); - c_avx[8] = mul_add (a6, b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[9]); - temp = mul_add (a1, b[8], temp); - temp = mul_add (a[8], b1, temp); - temp = mul_add (a[9], b0, temp); - temp = mul_add (a2, b7, temp); - temp = mul_add (a3, b6, temp); - temp = mul_add (a4, b5, temp); - temp = mul_add (a5, b4, temp); - temp = mul_add (a6, b3, temp); - c_avx[9] = mul_add (a7, b2, temp); - - temp= _mm256_mullo_epi16 (a0, b[10]); - temp = mul_add (a1, b[9], temp); - 
temp = mul_add (a[9], b1, temp); - temp = mul_add (a[10], b0, temp); - temp = mul_add (a2, b[8], temp); - temp = mul_add (a3, b7, temp); - temp = mul_add (a4, b6, temp); - temp = mul_add (a5, b5, temp); - temp = mul_add (a6, b4, temp); - temp = mul_add (a7, b3, temp); - c_avx[10] = mul_add (a[8], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[11]); - temp = mul_add (a1, b[10], temp ); - temp = mul_add (a[10], b1, temp ); - temp = mul_add (a[11], b0, temp ); - temp = mul_add (a2, b[9], temp ); - temp = mul_add (a3, b[8], temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a[8], b3, temp ); - c_avx[11] = mul_add (a[9], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[12]); - temp = mul_add (a1, b[11], temp); - temp = mul_add (a[11], b1, temp); - temp = mul_add (a[12], b0, temp); - temp = mul_add (a2, b[10], temp); - temp = mul_add (a3, b[9], temp); - temp = mul_add (a4, b[8], temp); - temp = mul_add (a5, b7, temp); - temp = mul_add (a6, b6, temp); - temp = mul_add (a7, b5, temp); - temp = mul_add (a[8], b4, temp); - temp = mul_add (a[9], b3, temp); - c_avx[12] = mul_add (a[10], b2, temp); - - temp = _mm256_mullo_epi16 (a0, b[13]); - temp = mul_add (a1, b[12], temp ); - temp = mul_add (a[12], b1, temp ); - temp = mul_add (a[13], b0, temp ); - temp = mul_add (a2, b[11], temp ); - temp = mul_add (a3, b[10], temp ); - temp = mul_add (a4, b[9], temp ); - temp = mul_add (a5, b[8], temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a[8], b5, temp ); - temp = mul_add (a[9], b4, temp ); - temp = mul_add (a[10], b3, temp ); - c_avx[13] = mul_add (a[11], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[14]); - temp = mul_add (a1, b[13], temp ); - temp = mul_add (a[13], b1, temp ); - temp = mul_add (a[14], b0, temp ); - temp = mul_add (a2, b[12], temp ); - temp = mul_add (a3, b[11], temp ); - temp = mul_add (a4, b[10], temp ); 
- temp = mul_add (a5, b[9], temp ); - temp = mul_add (a6, b[8], temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a[8], b6, temp ); - temp = mul_add (a[9], b5, temp ); - temp = mul_add (a[10], b4, temp ); - temp = mul_add (a[11], b3, temp ); - c_avx[14] = mul_add (a[12], b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b[15]); - temp = mul_add (a1, b[14], temp ); - temp = mul_add (a[14], b1, temp ); - temp = mul_add (a[15], b0, temp ); - temp = mul_add (a2, b[13], temp ); - temp = mul_add (a3, b[12], temp ); - temp = mul_add (a4, b[11], temp ); - temp = mul_add (a5, b[10], temp ); - temp = mul_add (a6, b[9], temp ); - temp = mul_add (a7, b[8], temp ); - temp = mul_add (a[8], b7, temp ); - temp = mul_add (a[9], b6, temp ); - temp = mul_add (a[10], b5, temp ); - temp = mul_add (a[11], b4, temp ); - temp = mul_add (a[12], b3, temp ); - c_avx[15] = mul_add (a[13], b2, temp ); - - - // unrolled second triangle - a0=a[14]; - a1=a[15]; - a2=a[13]; - a3=a[12]; - a4=a[11]; - a5=a[10]; - a6=a[9]; - a7=a[8]; - - b0=b[14]; - b1=b[15]; - b2=b[13]; - b3=b[12]; - b4=b[11]; - b5=b[10]; - b6=b[9]; - b7=b[8]; - - - temp = _mm256_mullo_epi16 (a[1], b1); - temp = mul_add (a[2], b0, temp ); - temp = mul_add (a[3], b2, temp ); - temp = mul_add (a[4], b3, temp ); - temp = mul_add (a[5], b4, temp ); - temp = mul_add (a[6], b5, temp ); - temp = mul_add (a[7], b6, temp ); - temp = mul_add (a7, b7, temp ); - temp = mul_add (a6, b[7], temp ); - temp = mul_add (a5, b[6], temp ); - temp = mul_add (a4, b[5], temp ); - temp = mul_add (a3, b[4], temp ); - temp = mul_add (a2, b[3], temp ); - temp = mul_add (a0, b[2], temp ); - c_avx[16] = mul_add (a1, b[1], temp ); - - temp = _mm256_mullo_epi16 (a[2], b1); - temp = mul_add (a[3], b0, temp ); - temp = mul_add (a[4], b2, temp ); - temp = mul_add (a[5], b3, temp ); - temp = mul_add (a[6], b4, temp ); - temp = mul_add (a[7], b5, temp ); - temp = mul_add (a7, b6, temp ); - temp = mul_add (a6, b7, temp ); - temp = mul_add (a5, b[7], temp ); - 
temp = mul_add (a4, b[6], temp ); - temp = mul_add (a3, b[5], temp ); - temp = mul_add (a2, b[4], temp ); - temp = mul_add (a0, b[3], temp ); - c_avx[17] = mul_add (a1, b[2], temp ); - - temp = _mm256_mullo_epi16 (a[3], b1); - temp = mul_add (a[4], b0, temp ); - temp = mul_add (a[5], b2, temp ); - temp = mul_add (a[6], b3, temp ); - temp = mul_add (a[7], b4, temp ); - temp = mul_add (a7, b5, temp ); - temp = mul_add (a6, b6, temp ); - temp = mul_add (a5, b7, temp ); - temp = mul_add (a4, b[7], temp ); - temp = mul_add (a3, b[6], temp ); - temp = mul_add (a2, b[5], temp ); - temp = mul_add (a0, b[4], temp ); - c_avx[18] = mul_add (a1, b[3], temp ); - - temp = _mm256_mullo_epi16 (a[4], b1); - temp = mul_add (a[5], b0, temp ); - temp = mul_add (a[6], b2, temp ); - temp = mul_add (a[7], b3, temp ); - temp = mul_add (a7, b4, temp ); - temp = mul_add (a6, b5, temp ); - temp = mul_add (a5, b6, temp ); - temp = mul_add (a4, b7, temp ); - temp = mul_add (a3, b[7], temp ); - temp = mul_add (a2, b[6], temp ); - temp = mul_add (a0, b[5], temp ); - c_avx[19] = mul_add (a1, b[4], temp ); - - temp = _mm256_mullo_epi16 (a[5], b1); - temp = mul_add (a[6], b0, temp ); - temp = mul_add (a[7], b2, temp ); - temp = mul_add (a7, b3, temp ); - temp = mul_add (a6, b4, temp ); - temp = mul_add (a5, b5, temp ); - temp = mul_add (a4, b6, temp ); - temp = mul_add (a3, b7, temp ); - temp = mul_add (a2, b[7], temp ); - temp = mul_add (a0, b[6], temp ); - c_avx[20] = mul_add (a1, b[5], temp ); - - temp = _mm256_mullo_epi16 (a[6], b1); - temp = mul_add (a[7], b0, temp ); - temp = mul_add (a7, b2, temp ); - temp = mul_add (a6, b3, temp ); - temp = mul_add (a5, b4, temp ); - temp = mul_add (a4, b5, temp ); - temp = mul_add (a3, b6, temp ); - temp = mul_add (a2, b7, temp ); - temp = mul_add (a0, b[7], temp ); - c_avx[21] = mul_add (a1, b[6], temp ); - - temp = _mm256_mullo_epi16 (a[7], b1); - temp = mul_add (a7, b0, temp ); - temp = mul_add (a6, b2, temp ); - temp = mul_add (a5, b3, temp ); - temp = 
mul_add (a4, b4, temp ); - temp = mul_add (a3, b5, temp ); - temp = mul_add (a2, b6, temp ); - temp = mul_add (a0, b7, temp ); - c_avx[22] = mul_add (a1, b[7], temp ); - - temp = _mm256_mullo_epi16 (a7, b1); - temp = mul_add (a6, b0, temp ); - temp = mul_add (a5, b2, temp ); - temp = mul_add (a4, b3, temp ); - temp = mul_add (a3, b4, temp ); - temp = mul_add (a2, b5, temp ); - temp = mul_add (a0, b6, temp ); - c_avx[23] = mul_add (a1, b7, temp ); - - temp = _mm256_mullo_epi16 (a6, b1); - temp = mul_add (a5, b0, temp ); - temp = mul_add (a4, b2, temp ); - temp = mul_add (a3, b3, temp ); - temp = mul_add (a2, b4, temp ); - temp = mul_add (a0, b5, temp ); - c_avx[24] = mul_add (a1, b6, temp ); - - temp = _mm256_mullo_epi16 (a5, b1); - temp = mul_add (a4, b0, temp ); - temp = mul_add (a3, b2, temp ); - temp = mul_add (a2, b3, temp ); - temp = mul_add (a0, b4, temp ); - c_avx[25] = mul_add (a1, b5, temp ); - - temp = _mm256_mullo_epi16 (a4, b1); - temp = mul_add (a3, b0, temp ); - temp = mul_add (a2, b2, temp ); - temp = mul_add (a0, b3, temp ); - c_avx[26] = mul_add (a1, b4, temp ); - - temp = _mm256_mullo_epi16 (a3, b1); - temp = mul_add (a2, b0, temp ); - temp = mul_add (a0, b2, temp ); - c_avx[27] = mul_add (a1, b3, temp ); - - temp = _mm256_mullo_epi16 (a2, b1); - temp = mul_add (a0, b0, temp ); - c_avx[28] = mul_add (a1, b2, temp ); - - temp = _mm256_mullo_epi16 (a0, b1); - c_avx[29] = mul_add (a1, b0, temp); - - c_avx[30] = _mm256_mullo_epi16 (a1, b1); - - - c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); - -} diff --git a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c deleted file mode 100644 index 78fb86c2..00000000 --- a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c +++ /dev/null @@ -1,1010 +0,0 @@ -/* -Cleaned version for step by step approach look into the _debug file -*/ -//#include "timing.c" -#include "consts.h" -#include "matrix.c" -#include "scm_avx.c" - -static void 
batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX. -{ - __m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time - - //uint16_t i; - - register __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - - //CLOCK1=cpucycles(); - - //------------------AVX evaluation for 1st poly----------------------- - - r0_avx=a[0]; - r1_avx=a[1]; - r2_avx=a[2]; - r3_avx=a[3]; - a_bucket[0]=r0_avx; - a_bucket[1]=r1_avx; - a_bucket[2]=r2_avx; - a_bucket[3]=r3_avx; - a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]); - - - //------------------AVX evaluation for 1st poly ends------------------ - - - //------------------AVX evaluation for 2nd poly----------------------- - r0_avx=a[small_len_avx]; - r1_avx=a[small_len_avx+1]; - r2_avx=a[small_len_avx+2]; - r3_avx=a[small_len_avx+3]; - a_bucket[0+9]=r0_avx; - a_bucket[1+9]=r1_avx; - a_bucket[2+9]=r2_avx; - a_bucket[3+9]=r3_avx; - a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]); - - - //------------------AVX evaluation for 2nd poly ends------------------ - - - //------------------AVX evaluation for 3rd poly----------------------- - r0_avx=a[2*small_len_avx]; - r1_avx=a[2*small_len_avx+1]; - r2_avx=a[2*small_len_avx+2]; - r3_avx=a[2*small_len_avx+3]; - a_bucket[0+18]=r0_avx; - a_bucket[1+18]=r1_avx; - a_bucket[2+18]=r2_avx; - a_bucket[3+18]=r3_avx; - a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+18]= _mm256_add_epi16(r0_avx, 
r2_avx); - a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]); - - //------------------AVX evaluation for 3rd poly ends------------------ - - - //------------------AVX evaluation for 4th poly----------------------- - - r0_avx=a[3*small_len_avx]; - r1_avx=a[3*small_len_avx+1]; - r2_avx=a[3*small_len_avx+2]; - r3_avx=a[3*small_len_avx+3]; - a_bucket[0+27]=r0_avx; - a_bucket[1+27]=r1_avx; - a_bucket[2+27]=r2_avx; - a_bucket[3+27]=r3_avx; - a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]); - - //------------------AVX evaluation for 4th poly ends------------------ - - //------------------AVX evaluation for 5th poly----------------------- - - r0_avx=a[4*small_len_avx+0]; - r1_avx=a[4*small_len_avx+1]; - r2_avx=a[4*small_len_avx+2]; - r3_avx=a[4*small_len_avx+3]; - a_bucket[0+36]=r0_avx; - a_bucket[1+36]=r1_avx; - a_bucket[2+36]=r2_avx; - a_bucket[3+36]=r3_avx; - a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]); - - //------------------AVX evaluation for 5th poly ends------------------ - - - //------------------AVX evaluation for 6th poly----------------------- - r0_avx=a[5*small_len_avx]; - r1_avx=a[5*small_len_avx+1]; - r2_avx=a[5*small_len_avx+2]; - r3_avx=a[5*small_len_avx+3]; - a_bucket[0+45]=r0_avx; - a_bucket[1+45]=r1_avx; - a_bucket[2+45]=r2_avx; - a_bucket[3+45]=r3_avx; - a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+45]= 
_mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]); - - //------------------AVX evaluation for 6th poly ends------------------ - - //------------------AVX evaluation for 7th poly----------------------- - - r0_avx=a[6*small_len_avx]; - r1_avx=a[6*small_len_avx+1]; - r2_avx=a[6*small_len_avx+2]; - r3_avx=a[6*small_len_avx+3]; - a_bucket[0+54]=r0_avx; - a_bucket[1+54]=r1_avx; - a_bucket[2+54]=r2_avx; - a_bucket[3+54]=r3_avx; - a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]); - - //------------------AVX evaluation for 7th poly ends------------------ - - - - //CLOCK2=cpucycles(); - //CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1); - //printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1); - - - //CLOCK1=cpucycles(); - //-----------------Forward transposes-------------------------------------- - transpose_n1(a_bucket); - transpose_n1(a_bucket+16); - transpose_n1(a_bucket+32); - transpose_n1(a_bucket+48); - - //-----------------Forwatrd transposes ends--------------------------------- - - //----------------------all multiplications--------------------------------- - if(f==0){ - schoolbook_avx_new2(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - else{ - schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE); - //schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket); - schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE); - schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE); - } - /* - 
schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f); - schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f); - schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f); - */ - - - //----------------------all multiplications ends----------------------------- - - - //-----------------Reverse transposes-------------------------------------- - - /* - transpose(c_bucket); - transpose(c_bucket+16); - - transpose(c_bucket+2*SCM_SIZE); - transpose(c_bucket+16+2*SCM_SIZE); - - transpose(c_bucket+4*SCM_SIZE); - transpose(c_bucket+16+4*SCM_SIZE); - - transpose(c_bucket+6*SCM_SIZE); - transpose(c_bucket+16+6*SCM_SIZE); - */ - //-----------------Reverse transposes ends--------------------------------- - - //CLOCK2=cpucycles(); - //CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1); - - //KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6); - -} - -static void KARA_eval(__m256i* b, __m256i *b_bucket){ - - __m256i r0_avx, r1_avx, r2_avx, r3_avx; - - - //-------1st poly---------------------------------------------------- - r0_avx=b[0]; - r1_avx=b[1]; - r2_avx=b[2]; - r3_avx=b[3]; - b_bucket[0]=r0_avx; - b_bucket[1]=r1_avx; - b_bucket[2]=r2_avx; - b_bucket[3]=r3_avx; - b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]); - //-------2nd poly---------------------------------------------------- - - r0_avx=b[small_len_avx]; - r1_avx=b[small_len_avx+1]; - r2_avx=b[small_len_avx+2]; - r3_avx=b[small_len_avx+3]; - b_bucket[0+9]=r0_avx; - b_bucket[1+9]=r1_avx; - b_bucket[2+9]=r2_avx; - b_bucket[3+9]=r3_avx; - b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx); - 
b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]); - - //-------3rd poly---------------------------------------------------- - - r0_avx=b[2*small_len_avx+0]; - r1_avx=b[2*small_len_avx+1]; - r2_avx=b[2*small_len_avx+2]; - r3_avx=b[2*small_len_avx+3]; - b_bucket[0+18]=r0_avx; - b_bucket[1+18]=r1_avx; - b_bucket[2+18]=r2_avx; - b_bucket[3+18]=r3_avx; - b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]); - - //-------4th poly---------------------------------------------------- - r0_avx=b[3*small_len_avx]; - r1_avx=b[3*small_len_avx+1]; - r2_avx=b[3*small_len_avx+2]; - r3_avx=b[3*small_len_avx+3]; - b_bucket[0+27]=r0_avx; - b_bucket[1+27]=r1_avx; - b_bucket[2+27]=r2_avx; - b_bucket[3+27]=r3_avx; - b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]); - - //-------5th poly---------------------------------------------------- - - r0_avx=b[4*small_len_avx]; - r1_avx=b[4*small_len_avx+1]; - r2_avx=b[4*small_len_avx+2]; - r3_avx=b[4*small_len_avx+3]; - b_bucket[0+36]=r0_avx; - b_bucket[1+36]=r1_avx; - b_bucket[2+36]=r2_avx; - b_bucket[3+36]=r3_avx; - b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]); - - //-------6th poly---------------------------------------------------- - - r0_avx=b[5*small_len_avx]; - 
r1_avx=b[5*small_len_avx+1]; - r2_avx=b[5*small_len_avx+2]; - r3_avx=b[5*small_len_avx+3]; - b_bucket[0+45]=r0_avx; - b_bucket[1+45]=r1_avx; - b_bucket[2+45]=r2_avx; - b_bucket[3+45]=r3_avx; - b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]); - - //-------7th poly---------------------------------------------------- - - r0_avx=b[6*small_len_avx]; - r1_avx=b[6*small_len_avx+1]; - r2_avx=b[6*small_len_avx+2]; - r3_avx=b[6*small_len_avx+3]; - b_bucket[0+54]=r0_avx; - b_bucket[1+54]=r1_avx; - b_bucket[2+54]=r2_avx; - b_bucket[3+54]=r3_avx; - b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx); - b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx); - b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx); - b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx); - b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]); - - //--------------Evaluating B poly ends------------------------------- - - transpose_n1(b_bucket); - transpose_n1(b_bucket+16); - transpose_n1(b_bucket+32); - transpose_n1(b_bucket+48); -} - -static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){ - - //int64_t i; - register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results - - __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; - - //CLOCK1=cpucycles(); - - //------------------------AVX interpolation for 1st poly external------------------- - - //loop1 - res_avx0 = c_bucket[0]; - res_avx2 = c_bucket[1]; - res_avx4 = c_bucket[2]; - res_avx6 = c_bucket[3]; - - c6_avx=c_bucket[6]; - c7_avx=c_bucket[7]; - - c8_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx); - - res_avx1 = c_bucket[16]; - res_avx3 = c_bucket[17]; - res_avx5 = c_bucket[18]; - res_avx7 = c_bucket[19]; - - c22_avx=c_bucket[22]; - c23_avx=c_bucket[23]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final0[0]=res_avx0; - result_final0[1]=res_avx1; - - result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final0[6]=res_avx6; - result_final0[7]=res_avx7; - - - //------------------------AVX interpolation for 1st poly ends-------------- - - - //------------------------AVX interpolation for 2nd poly external------------------- - - //loop1 - res_avx0 = c_bucket[9]; //c_bucket0 - res_avx2 = c_bucket[10]; //c_bucket1 - res_avx4 = c_bucket[11]; //c_bucket2 - res_avx6 = c_bucket[12]; //c_bucket3 - - c6_avx=c_bucket[15]; //c_bucket6 - c7_avx=c_bucket[32]; 
//c_bucket7 - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx); - - res_avx1 = c_bucket[25]; //c_bucket0 - res_avx3 = c_bucket[26]; //c_bucket1 - res_avx5 = c_bucket[27]; //c_bucket2 - res_avx7 = c_bucket[28]; //c_bucket3 - - c22_avx=c_bucket[31]; - c23_avx=c_bucket[48]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final1[0]=res_avx0; - result_final1[1]=res_avx1; - - result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final1[6]=res_avx6; - result_final1[7]=res_avx7; - - - //------------------------AVX interpolation for 2nd poly ends-------------- - - //------------------------AVX interpolation for 3rd poly external------------------- - - //loop1 - res_avx0 = c_bucket[34]; //c_bucket0 - res_avx2 = c_bucket[35]; //c_bucket1 - res_avx4 = c_bucket[36]; - res_avx6 = c_bucket[37]; - - 
c6_avx=c_bucket[40]; - c7_avx=c_bucket[41]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx); - - res_avx1 = c_bucket[50]; //c_bucket0 - res_avx3 = c_bucket[51]; //c_bucket1 - res_avx5 = c_bucket[52]; - res_avx7 = c_bucket[53]; - - c22_avx=c_bucket[56]; - c23_avx=c_bucket[57]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - //loop4 - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - //loop5 - result_final2[0]=res_avx0; - result_final2[1]=res_avx1; - - result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final2[6]=res_avx6; - result_final2[7]=res_avx7; - - //------------------------AVX interpolation for 3rd poly ends-------------- - - //------------------------AVX interpolation for 4th poly external------------------- - - //loop1 - res_avx0 = c_bucket[43]; - res_avx2 = c_bucket[44]; - res_avx4 = c_bucket[45]; - res_avx6 = c_bucket[46]; - - c6_avx=c_bucket[65]; - 
c7_avx=c_bucket[66]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx); - - res_avx1 = c_bucket[59]; - res_avx3 = c_bucket[60]; - res_avx5 = c_bucket[61]; - res_avx7 = c_bucket[62]; - - c22_avx=c_bucket[81]; - c23_avx=c_bucket[82]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final3[0]=res_avx0; - result_final3[1]=res_avx1; - - result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final3[6]=res_avx6; - result_final3[7]=res_avx7; - - - //------------------------AVX interpolation for 4th poly ends-------------- - - //------------------------AVX interpolation for 5th poly external------------------- - - //loop1 - res_avx0 = c_bucket[68]; - res_avx2 = c_bucket[69]; - res_avx4 = c_bucket[70]; - res_avx6 = c_bucket[71]; - - c6_avx=c_bucket[74]; - c7_avx=c_bucket[75]; - - c8_avx = 
_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx); - - res_avx1 = c_bucket[84]; - res_avx3 = c_bucket[85]; - res_avx5 = c_bucket[86]; - res_avx7 = c_bucket[87]; - - c22_avx=c_bucket[90]; - c23_avx=c_bucket[91]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final4[0]=res_avx0; - result_final4[1]=res_avx1; - - result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final4[6]=res_avx6; - result_final4[7]=res_avx7; - - - //------------------------AVX interpolation for 5th poly ends-------------- - - //------------------------AVX interpolation for 6th poly external------------------- - - //loop1 - res_avx0 = c_bucket[77]; - res_avx2 = c_bucket[78]; - res_avx4 = c_bucket[79]; - res_avx6 = c_bucket[96]; - - c6_avx=c_bucket[99]; - c7_avx=c_bucket[100]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], 
c6_avx), c7_avx); - - res_avx1 = c_bucket[93]; - res_avx3 = c_bucket[94]; - res_avx5 = c_bucket[95]; - res_avx7 = c_bucket[112]; - - c22_avx=c_bucket[115]; - c23_avx=c_bucket[116]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final5[0]=res_avx0; - result_final5[1]=res_avx1; - - result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final5[6]=res_avx6; - result_final5[7]=res_avx7; - - - //------------------------AVX interpolation for 6th poly ends-------------- - - //------------------------AVX interpolation for 7th poly external------------------- - - //loop1 - res_avx0 = c_bucket[102]; - res_avx2 = c_bucket[103]; - res_avx4 = c_bucket[104]; - res_avx6 = c_bucket[105]; - - c6_avx=c_bucket[108]; - c7_avx=c_bucket[109]; - - c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx); - - res_avx1 = 
c_bucket[118]; - res_avx3 = c_bucket[119]; - res_avx5 = c_bucket[120]; - res_avx7 = c_bucket[121]; - - c22_avx=c_bucket[124]; - c23_avx=c_bucket[125]; - - c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7); - - c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx); - - c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3); - - temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6); - res_avx5 = _mm256_add_epi16(res_avx5, temp); - - temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2); - res_avx1 = _mm256_add_epi16(res_avx1, temp); - - c22_avx=_mm256_add_epi16(c22_avx, c8_avx); - - res_avx6 = _mm256_add_epi16(res_avx6, c21_avx); - - res_avx2 = _mm256_add_epi16(res_avx2, c20_avx); - - c7_avx=_mm256_add_epi16(c7_avx, c24_avx); - - - //loop4 - - c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); - c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5); - - c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6); - c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7); - - //loop5 - result_final6[0]=res_avx0; - result_final6[1]=res_avx1; - - result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx); - result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx); - - - result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx); - result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx); - - result_final6[6]=res_avx6; - result_final6[7]=res_avx7; - - - //------------------------AVX interpolation for 7th poly ends-------------- - - //CLOCK2=cpucycles(); - //CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1); - //printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1); - - - -} - -static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ - - int i; - -//---------------AVX data----------------------------- - - __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; - __m256i 
aw_avx[7*small_len_avx]; - -//----------------AVX data---------------------------- - - -// EVALUATION - - //CLOCK1=cpucycles(); - - for (i=0; i> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } - PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); - PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); - memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); + PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, res); // pack public key } -void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { - uint16_t A[SABER_L][SABER_L][SABER_N]; - uint16_t sp[SABER_L][SABER_N]; - uint16_t bp[SABER_L][SABER_N] = {{0}}; - uint16_t vp[SABER_N] = {0}; - uint16_t mp[SABER_N]; - uint16_t b[SABER_L][SABER_N]; + +void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { size_t i, j; + + poly A[SABER_L][SABER_L]; + poly res[SABER_L]; + poly s[SABER_L]; + poly *temp = A[0]; // re-use stack space + poly *vprime = &A[0][0]; + poly *message = &A[0][1]; + const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; + uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + PQCLEAN_SABER_CLEAN_GenSecret(s, noiseseed); PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); - PQCLEAN_SABER_CLEAN_GenSecret(sp, seed_sp); - PQCLEAN_SABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); + PQCLEAN_SABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed - for (i = 0; i < SABER_L; i++) { + + // rounding + for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits for (j = 0; j < SABER_N; j++) { - bp[i][j] 
= (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); + res[i].coeffs[j] += h1; + res[i].coeffs[j] >>= SABER_EQ - SABER_EP; + res[i].coeffs[j] &= SABER_Q - 1; } } + PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, res); - PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); - PQCLEAN_SABER_CLEAN_BS2POLVECp(b, pk); - PQCLEAN_SABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); - - PQCLEAN_SABER_CLEAN_BS2POLmsg(mp, m); - - for (j = 0; j < SABER_N; j++) { - vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); - } - - PQCLEAN_SABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); -} - -void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { - - uint16_t s[SABER_L][SABER_N]; - uint16_t b[SABER_L][SABER_N]; - uint16_t v[SABER_N] = {0}; - uint16_t cm[SABER_N]; - size_t i; - - PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk); - PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext); - PQCLEAN_SABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); - PQCLEAN_SABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); + // vector-vector scalar multiplication with mod p + PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, pk); + PQCLEAN_SABER_CLEAN_InnerProd(vprime, temp, s); + PQCLEAN_SABER_CLEAN_BS2POLmsg(message, m); for (i = 0; i < SABER_N; i++) { - v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); + vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); + vprime->coeffs[i] &= SABER_P - 1; + vprime->coeffs[i] >>= SABER_EP - SABER_ET; + } + + PQCLEAN_SABER_CLEAN_POLT2BS(msk_c, vprime); +} + + +void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { + size_t i; + + poly temp[SABER_L]; + poly s[SABER_L]; + + const uint8_t *packed_cm = 
ciphertext + SABER_POLYVECCOMPRESSEDBYTES; + poly *v = &temp[0]; + poly *cm = &temp[1]; + + PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk); + PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, ciphertext); + PQCLEAN_SABER_CLEAN_InnerProd(&temp[0], temp, s); + + PQCLEAN_SABER_CLEAN_BS2POLT(cm, packed_cm); + + for (i = 0; i < SABER_N; i++) { + v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); + v->coeffs[i] &= SABER_P - 1; + v->coeffs[i] >>= SABER_EP - 1; } PQCLEAN_SABER_CLEAN_POLmsg2BS(m, v); diff --git a/crypto_kem/saber/clean/SABER_indcpa.h b/crypto_kem/saber/clean/SABER_indcpa.h index 3be3ce1c..a5e89e96 100644 --- a/crypto_kem/saber/clean/SABER_indcpa.h +++ b/crypto_kem/saber/clean/SABER_indcpa.h @@ -5,7 +5,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); -void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); +void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); diff --git a/crypto_kem/saber/clean/SABER_params.h b/crypto_kem/saber/clean/SABER_params.h index 200ed0e6..d1a5ddd7 100644 --- a/crypto_kem/saber/clean/SABER_params.h +++ b/crypto_kem/saber/clean/SABER_params.h @@ -2,19 +2,21 @@ #define PARAMS_H -/* Change this for different security strengths */ - /* Don't change anything below this line */ #define SABER_L 3 #define SABER_MU 8 #define SABER_ET 4 -#define SABER_EQ 13 -#define SABER_EP 10 #define SABER_N 256 +#define SABER_EP 10 +#define SABER_P (1 << SABER_EP) + +#define SABER_EQ 13 +#define SABER_Q (1 << SABER_EQ) + 
#define SABER_SEEDBYTES 32 -#define SABER_NOISE_SEEDBYTES 32 +#define SABER_NOISESEEDBYTES 32 #define SABER_KEYBYTES 32 #define SABER_HASHBYTES 32 diff --git a/crypto_kem/saber/clean/api.h b/crypto_kem/saber/clean/api.h index 699a19f4..7448d46d 100644 --- a/crypto_kem/saber/clean/api.h +++ b/crypto_kem/saber/clean/api.h @@ -15,4 +15,4 @@ int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, cons int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); -#endif /* api_h */ +#endif /* PQCLEAN_SABER_CLEAN_API_H */ diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c index e196bd34..2542f0b0 100644 --- a/crypto_kem/saber/clean/pack_unpack.c +++ b/crypto_kem/saber/clean/pack_unpack.c @@ -1,132 +1,145 @@ -#include "api.h" +#include "SABER_params.h" #include "pack_unpack.h" +#include "poly.h" #include -void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 2; j++) { - offset_byte = j; - offset_data = 2 * j; - bytes[offset_byte] = (data[offset_data] & 0x0f) | ((data[offset_data + 1] & 0x0f) << 4); + out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); + in += 2; + out += 1; } } -void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { - size_t j, offset_byte, offset_data; +void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 2; j++) { - offset_byte = j; - offset_data = 2 * j; - data[offset_data] = bytes[offset_byte] & 0x0f; - data[offset_data + 1] = (bytes[offset_byte] >> 4) & 0x0f; + out[0] = in[0] & 0x0f; + out[1] = 
(in[0] >> 4) & 0x0f; + in += 1; + out += 2; } } -static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); - bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); - bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); - bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); - bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); - bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); - bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); - bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); - bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); - bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); - bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); + out[2] = ((in[1] >> 3) & 0xff); + out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); + out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); + out[5] = ((in[3] >> 1) & 0xff); + out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); + out[7] = ((in[4] >> 4) & 0xff); + out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); + out[9] = ((in[5] >> 7) & 0x3f) | 
((in[6] & 0x03) << 6); + out[10] = ((in[6] >> 2) & 0xff); + out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); + out[12] = ((in[7] >> 5) & 0xff); + in += 8; + out += 13; } } -static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - offset_byte = 13 * j; - offset_data = 8 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); - data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); - data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); - data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); - data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); - data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); - data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); - data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); + out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); + out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); + out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); + out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); + out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); + out[6] = (in[9] >> 6 & 
(0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); + out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + in += 13; + out += 8; } } -static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { - size_t j, offset_byte, offset_data; +static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { + size_t j; + const uint16_t *in = data->coeffs; + uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); - bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); - bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); - bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); - bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); + out[0] = (in[0] & (0xff)); + out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); + out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); + out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); + out[4] = ((in[3] >> 2) & 0xff); + in += 4; + out += 5; } } -static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { - size_t j, offset_byte, offset_data; +static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { + size_t j; + const uint8_t *in = bytes; + uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - offset_byte = 5 * j; - offset_data = 4 * j; - data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); - data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); - data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); - data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | 
((bytes[offset_byte + 4] & 0xff) << 2); + out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); + out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); + out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); + out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + in += 5; + out += 4; } } -void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); + POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); } } -void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { +void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLq(data[i], bytes + i * SABER_POLYBYTES); + BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); } } -void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { +void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { size_t i; for (i = 0; i < SABER_L; i++) { - POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); + POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); } } -void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { +void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { size_t i; for (i = 0; i < SABER_L; i++) { - BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); + BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); } } -void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { +void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { 
size_t i, j; for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - data[j * 8 + i] = ((bytes[j] >> i) & 0x01); + data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); } } } -void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { +void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { size_t i, j; memset(bytes, 0, SABER_KEYBYTES); for (j = 0; j < SABER_KEYBYTES; j++) { for (i = 0; i < 8; i++) { - bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); + bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); } } } diff --git a/crypto_kem/saber/clean/pack_unpack.h b/crypto_kem/saber/clean/pack_unpack.h index 52537c07..fc6a3abf 100644 --- a/crypto_kem/saber/clean/pack_unpack.h +++ b/crypto_kem/saber/clean/pack_unpack.h @@ -1,27 +1,28 @@ #ifndef PACK_UNPACK_H #define PACK_UNPACK_H #include "SABER_params.h" +#include "poly.h" #include #include -void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); +void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); -void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); +void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); -void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); -void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); +void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); -void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); +void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); -void 
PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); +void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); -void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); +void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); -void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); +void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); #endif diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c index 2c44e962..588d0c99 100644 --- a/crypto_kem/saber/clean/poly.c +++ b/crypto_kem/saber/clean/poly.c @@ -3,32 +3,40 @@ #include "fips202.h" #include "pack_unpack.h" #include "poly.h" -#include "poly_mul.h" #include -void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { +void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { size_t i, j; - for (i = 0; i < SABER_L; i++) { - for (j = 0; j < SABER_L; j++) { - if (transpose == 1) { - PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); - } else { - PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); + + if (transpose) { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); + } + } + } else { + for (i = 0; i < SABER_L; i++) { + PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); + for (j = 1; j < SABER_L; j++) { + PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); } } } } -void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { - size_t 
j; - for (j = 0; j < SABER_L; j++) { - PQCLEAN_SABER_CLEAN_poly_mul_acc(res, b[j], s[j]); +void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { + size_t i; + + PQCLEAN_SABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); + for (i = 1; i < SABER_L; i++) { + PQCLEAN_SABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); } } -void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYVECBYTES]; +void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYVECBYTES]; shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); @@ -37,13 +45,13 @@ void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const } } -void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { - uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; +void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { size_t i; + uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; - shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); + shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); for (i = 0; i < SABER_L; i++) { - PQCLEAN_SABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); + PQCLEAN_SABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); } } diff --git a/crypto_kem/saber/clean/poly.h b/crypto_kem/saber/clean/poly.h index dd882cb7..d365b489 100644 --- a/crypto_kem/saber/clean/poly.h +++ b/crypto_kem/saber/clean/poly.h @@ -3,13 +3,21 @@ #include "SABER_params.h" #include -void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); +typedef union { + uint16_t coeffs[SABER_N]; +} poly; -void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); 
-void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); +void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); -void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); +void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); + +void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); + +void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); + + +void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate); #endif diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c index 686960dc..0e03ff99 100644 --- a/crypto_kem/saber/clean/poly_mul.c +++ b/crypto_kem/saber/clean/poly_mul.c @@ -1,4 +1,4 @@ -#include "poly_mul.h" +#include "poly.h" #include #include @@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t } /* res += a*b */ -void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { - uint16_t c[2 * SABER_N] = {0}; +void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { + uint16_t C[2 * SABER_N] = {0}; size_t i; - toom_cook_4way(c, a, b); + toom_cook_4way(C, a->coeffs, b->coeffs); /* reduction */ - for (i = SABER_N; i < 2 * SABER_N; i++) { - res[i - SABER_N] += (c[i - SABER_N] - c[i]); + if (accumulate == 0) { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); + } + } else { + for (i = SABER_N; i < 2 * SABER_N; i++) { + c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); + } } } diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h index 82140f5b..b28b04f6 100644 --- 
a/crypto_kem/saber/clean/poly_mul.h +++ b/crypto_kem/saber/clean/poly_mul.h @@ -1,9 +1,3 @@ -#ifndef POLY_MUL_H -#define POLY_MUL_H -#include "SABER_params.h" -#include - -void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); -#endif + diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml index 1790559f..0ff04768 100644 --- a/test/duplicate_consistency/firesaber_avx2.yml +++ b/test/duplicate_consistency/firesaber_avx2.yml @@ -3,5 +3,14 @@ consistency_checks: scheme: firesaber implementation: clean files: + - api.h + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml index 3e93674e..5537ea62 100644 --- a/test/duplicate_consistency/firesaber_clean.yml +++ b/test/duplicate_consistency/firesaber_clean.yml @@ -3,5 +3,14 @@ consistency_checks: scheme: firesaber implementation: avx2 files: + - api.h + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml index 9239f8f0..ed9ea4b8 100644 --- a/test/duplicate_consistency/lightsaber_avx2.yml +++ b/test/duplicate_consistency/lightsaber_avx2.yml @@ -3,13 +3,27 @@ consistency_checks: scheme: lightsaber implementation: clean files: + - api.h + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c - source: scheme: saber implementation: clean files: + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c - source: scheme: saber @@ -22,13 +36,20 @@ consistency_checks: - SABER_indcpa.h - verify.h - kem.c - - pack_unpack.c + - 
poly.c + - poly_mul.c + - SABER_indcpa.c - verify.c - source: scheme: firesaber implementation: clean files: + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c - source: scheme: firesaber @@ -41,5 +62,7 @@ consistency_checks: - SABER_indcpa.h - verify.h - kem.c - - pack_unpack.c + - poly.c + - poly_mul.c + - SABER_indcpa.c - verify.c diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml index 14c8975d..8146b7d0 100644 --- a/test/duplicate_consistency/lightsaber_clean.yml +++ b/test/duplicate_consistency/lightsaber_clean.yml @@ -3,7 +3,16 @@ consistency_checks: scheme: lightsaber implementation: avx2 files: + - api.h + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c - source: scheme: saber @@ -24,7 +33,12 @@ consistency_checks: scheme: saber implementation: avx2 files: + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c - source: scheme: firesaber @@ -45,5 +59,10 @@ consistency_checks: scheme: firesaber implementation: avx2 files: + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml index 010ac0c9..4a04951d 100644 --- a/test/duplicate_consistency/saber_avx2.yml +++ b/test/duplicate_consistency/saber_avx2.yml @@ -3,13 +3,27 @@ consistency_checks: scheme: saber implementation: clean files: + - api.h + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c - source: scheme: firesaber implementation: clean files: + - cbd.h + - pack_unpack.h + - kem.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c - source: scheme: firesaber @@ -22,5 +36,7 @@ consistency_checks: - SABER_indcpa.h - verify.h - kem.c - - pack_unpack.c + - poly.c + 
- poly_mul.c + - SABER_indcpa.c - verify.c diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml index 7f01d619..a2700ea8 100644 --- a/test/duplicate_consistency/saber_clean.yml +++ b/test/duplicate_consistency/saber_clean.yml @@ -3,7 +3,16 @@ consistency_checks: scheme: saber implementation: avx2 files: + - api.h + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h + - SABER_params.h - verify.h + - cbd.c + - kem.c + - pack_unpack.c - verify.c - source: scheme: firesaber @@ -24,5 +33,10 @@ consistency_checks: scheme: firesaber implementation: avx2 files: + - cbd.h + - poly_mul.h + - pack_unpack.h + - SABER_indcpa.h - verify.h + - kem.c - verify.c From bb037b918b4fe80149baa1a74a21fd00aaa95943 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Wed, 28 Oct 2020 12:12:44 -0400 Subject: [PATCH 06/10] Update packaging script version --- crypto_kem/firesaber/META.yml | 4 ++-- crypto_kem/lightsaber/META.yml | 4 ++-- crypto_kem/saber/META.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index 0aa614ca..6cd4342b 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via 
https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index 027f1fab..d3d7bf13 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 7eb15ca2..32b4b964 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber + version: 
https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber supported_platforms: - architecture: x86_64 operating_systems: From 8af8939e2b32452ffd71d59077208d7e00e7c368 Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Wed, 28 Oct 2020 12:15:04 -0400 Subject: [PATCH 07/10] Remove empty file --- crypto_kem/firesaber/avx2/Makefile | 2 +- crypto_kem/firesaber/avx2/kem.h | 3 --- crypto_kem/lightsaber/avx2/Makefile | 2 +- crypto_kem/lightsaber/avx2/kem.h | 3 --- crypto_kem/saber/avx2/Makefile | 2 +- crypto_kem/saber/avx2/kem.h | 3 --- 6 files changed, 3 insertions(+), 12 deletions(-) delete mode 100644 crypto_kem/firesaber/avx2/kem.h delete mode 100644 crypto_kem/lightsaber/avx2/kem.h delete mode 100644 crypto_kem/saber/avx2/kem.h diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile index b7fbd7d8..1ecd3c1a 100644 --- a/crypto_kem/firesaber/avx2/Makefile +++ b/crypto_kem/firesaber/avx2/Makefile @@ -1,7 +1,7 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libfiresaber_avx2.a -HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h deleted file mode 100644 index b28b04f6..00000000 --- a/crypto_kem/firesaber/avx2/kem.h +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile index f2817574..ff4f4367 100644 --- a/crypto_kem/lightsaber/avx2/Makefile +++ b/crypto_kem/lightsaber/avx2/Makefile @@ -1,7 +1,7 @@ # This Makefile can be used with GNU Make or BSD Make 
LIB=liblightsaber_avx2.a -HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h deleted file mode 100644 index b28b04f6..00000000 --- a/crypto_kem/lightsaber/avx2/kem.h +++ /dev/null @@ -1,3 +0,0 @@ - - - diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile index 070665b4..41ea6101 100644 --- a/crypto_kem/saber/avx2/Makefile +++ b/crypto_kem/saber/avx2/Makefile @@ -1,7 +1,7 @@ # This Makefile can be used with GNU Make or BSD Make LIB=libsaber_avx2.a -HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h +HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h deleted file mode 100644 index b28b04f6..00000000 --- a/crypto_kem/saber/avx2/kem.h +++ /dev/null @@ -1,3 +0,0 @@ - - - From f8503cbd7158049fce79cc26fe784f17179226ea Mon Sep 17 00:00:00 2001 From: "John M. 
Schanck" Date: Wed, 28 Oct 2020 23:31:01 -0400 Subject: [PATCH 08/10] simplify pack_unpack.c --- crypto_kem/firesaber/META.yml | 4 +- crypto_kem/firesaber/avx2/pack_unpack.c | 76 +++++++++-------- crypto_kem/firesaber/clean/pack_unpack.c | 76 +++++++++-------- crypto_kem/lightsaber/META.yml | 4 +- crypto_kem/lightsaber/avx2/pack_unpack.c | 84 ++++++++++--------- crypto_kem/lightsaber/clean/pack_unpack.c | 84 ++++++++++--------- crypto_kem/saber/META.yml | 4 +- crypto_kem/saber/avx2/pack_unpack.c | 68 +++++++-------- crypto_kem/saber/clean/pack_unpack.c | 68 +++++++-------- test/duplicate_consistency/firesaber_avx2.yml | 1 - .../duplicate_consistency/firesaber_clean.yml | 1 - .../duplicate_consistency/lightsaber_avx2.yml | 5 -- .../lightsaber_clean.yml | 3 - test/duplicate_consistency/saber_avx2.yml | 3 - test/duplicate_consistency/saber_clean.yml | 2 - 15 files changed, 240 insertions(+), 243 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index 6cd4342b..3d1a0a41 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c 
b/crypto_kem/firesaber/avx2/pack_unpack.c index 41b9747a..82f5a3f0 100644 --- a/crypto_kem/firesaber/avx2/pack_unpack.c +++ b/crypto_kem/firesaber/avx2/pack_unpack.c @@ -8,23 +8,24 @@ void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const p const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); - out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); - out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); + out[0] = (in[0] & 0x3f) | (in[1] << 6); + out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4); + out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2); in += 4; out += 3; } } void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0] & 0x3f; - out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); - out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); - out[3] = ((in[2] & 0xff) >> 2); + out[0] = in[0]; + out[1] = (in[0] >> 6) | (in[1] << 2); + out[2] = (in[1] >> 4) | (in[2] << 4); + out[3] = (in[2] >> 2); in += 3; out += 4; } @@ -35,37 +36,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | 
((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -76,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = 
((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -91,10 +93,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + out[3] = (in[3] >> 6) | (in[4] << 2); in += 5; out += 4; } diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c index ec2f1263..91ffd723 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.c +++ b/crypto_kem/firesaber/clean/pack_unpack.c @@ -8,23 +8,24 @@ void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); - out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); - out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); + out[0] = (in[0] & 0x3f) | (in[1] << 6); + out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4); + out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2); in += 4; out += 3; } } void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) 
{ - out[0] = in[0] & 0x3f; - out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); - out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); - out[3] = ((in[2] & 0xff) >> 2); + out[0] = in[0]; + out[1] = (in[0] >> 6) | (in[1] << 2); + out[2] = (in[1] >> 4) | (in[2] << 4); + out[3] = (in[2] >> 2); in += 3; out += 4; } @@ -35,37 +36,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = 
(in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -76,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -91,10 +93,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + 
out[3] = (in[3] >> 6) | (in[4] << 2); in += 5; out += 4; } diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index d3d7bf13..263db2e0 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c index a9f866ae..a154d24c 100644 --- a/crypto_kem/lightsaber/avx2/pack_unpack.c +++ b/crypto_kem/lightsaber/avx2/pack_unpack.c @@ -8,27 +8,28 @@ void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); - out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); - out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); + out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6); + out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7); + out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5); in += 8; out += 3; } } 
void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0]) & 0x07; - out[1] = ((in[0]) >> 3) & 0x07; - out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); - out[3] = ((in[1]) >> 1) & 0x07; - out[4] = ((in[1]) >> 4) & 0x07; - out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); - out[6] = ((in[2] >> 2) & 0x07); - out[7] = ((in[2] >> 5) & 0x07); + out[0] = in[0]; + out[1] = in[0] >> 3; + out[2] = (in[0] >> 6) | (in[1] << 2); + out[3] = in[1] >> 1; + out[4] = in[1] >> 4; + out[5] = (in[1] >> 7) | (in[2] << 1); + out[6] = in[2] >> 2; + out[7] = in[2] >> 5; in += 3; out += 8; } @@ -39,37 +40,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = 
in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -80,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -95,10 +97,10 @@ static void BS2POLp(poly *data, 
const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + out[3] = (in[3] >> 6) | (in[4] << 2); in += 5; out += 4; } diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c index f64c4143..c1c8666c 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.c +++ b/crypto_kem/lightsaber/clean/pack_unpack.c @@ -8,27 +8,28 @@ void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); - out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); - out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); + out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6); + out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7); + out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5); in += 8; out += 3; } } void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0]) & 0x07; - out[1] = ((in[0]) >> 3) & 0x07; - out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); - out[3] = ((in[1]) >> 1) & 0x07; - out[4] = ((in[1]) >> 4) & 0x07; - out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); - out[6] = 
((in[2] >> 2) & 0x07); - out[7] = ((in[2] >> 5) & 0x07); + out[0] = in[0]; + out[1] = in[0] >> 3; + out[2] = (in[0] >> 6) | (in[1] << 2); + out[3] = in[1] >> 1; + out[4] = in[1] >> 4; + out[5] = (in[1] >> 7) | (in[2] << 1); + out[6] = in[2] >> 2; + out[7] = in[2] >> 5; in += 3; out += 8; } @@ -39,37 +40,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = (in[3] >> 2 & (0x3f)) | 
((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -80,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -95,10 +97,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + out[3] = (in[3] >> 6) | 
(in[4] << 2); in += 5; out += 4; } diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 32b4b964..319f4ebf 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c index 9bb46acb..a055b7e5 100644 --- a/crypto_kem/saber/avx2/pack_unpack.c +++ b/crypto_kem/saber/avx2/pack_unpack.c @@ -8,19 +8,20 @@ void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 2; j++) { - out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); + out[0] = (in[0] & 0x0f) | (in[1] << 4); in += 2; out += 1; } } void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 2; j++) { - out[0] = in[0] & 0x0f; - out[1] = (in[0] >> 4) & 0x0f; + out[0] = in[0]; + out[1] = in[0] >> 4; in += 1; out += 2; } @@ -31,37 +32,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { 
const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 
5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -72,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -87,10 +89,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + out[3] = (in[3] >> 6) | (in[4] << 2); in += 5; out += 4; } diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c index 2542f0b0..1b5bed81 100644 --- a/crypto_kem/saber/clean/pack_unpack.c +++ b/crypto_kem/saber/clean/pack_unpack.c @@ -8,19 +8,20 @@ void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly const uint16_t *in = data->coeffs; uint8_t 
*out = bytes; for (j = 0; j < SABER_N / 2; j++) { - out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); + out[0] = (in[0] & 0x0f) | (in[1] << 4); in += 2; out += 1; } } void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { + /* This function does not reduce its output mod T */ size_t j; const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 2; j++) { - out[0] = in[0] & 0x0f; - out[1] = (in[0] >> 4) & 0x0f; + out[0] = in[0]; + out[1] = in[0] >> 4; in += 1; out += 2; } @@ -31,37 +32,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); - out[2] = ((in[1] >> 3) & 0xff); - out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); - out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); - out[5] = ((in[3] >> 1) & 0xff); - out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); - out[7] = ((in[4] >> 4) & 0xff); - out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); - out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); - out[10] = ((in[6] >> 2) & 0xff); - out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); - out[12] = ((in[7] >> 5) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); + out[2] = in[1] >> 3; + out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); + out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); + out[5] = in[3] >> 1; + out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); + out[7] = in[4] >> 4; + out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); + out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); + out[10] = in[6] >> 2; + out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); + out[12] = in[7] >> 5; in += 8; out += 13; } } static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { + /* This function does not reduce its output mod Q */ size_t j; const uint8_t *in = bytes; 
uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); - out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); - out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); - out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); - out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); - out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); - out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); - out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); + out[0] = (in[0]) | (in[1] << 8); + out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11); + out[2] = (in[3] >> 2) | (in[4] << 6); + out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9); + out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12); + out[5] = (in[8] >> 1) | (in[9] << 7); + out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10); + out[7] = (in[11] >> 3) | (in[12] << 5); in += 13; out += 8; } @@ -72,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)); - out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); - out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); - out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); - out[4] = ((in[3] >> 2) & 0xff); + out[0] = in[0]; + out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); + out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); + out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); + out[4] = in[3] >> 2; in += 4; out += 5; } @@ -87,10 +89,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) const uint8_t *in = bytes; uint16_t *out = data->coeffs; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); - out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); - out[2] = ((in[2] >> 
4) & (0x0f)) | ((in[3] & 0x3f) << 4); - out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); + out[0] = in[0] | (in[1] << 8); + out[1] = (in[1] >> 2) | (in[2] << 6); + out[2] = (in[2] >> 4) | (in[3] << 4); + out[3] = (in[3] >> 6) | (in[4] << 2); in += 5; out += 4; } diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml index 0ff04768..f5240334 100644 --- a/test/duplicate_consistency/firesaber_avx2.yml +++ b/test/duplicate_consistency/firesaber_avx2.yml @@ -6,7 +6,6 @@ consistency_checks: - api.h - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - SABER_params.h - verify.h diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml index 5537ea62..bcfed7c0 100644 --- a/test/duplicate_consistency/firesaber_clean.yml +++ b/test/duplicate_consistency/firesaber_clean.yml @@ -5,7 +5,6 @@ consistency_checks: files: - api.h - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - SABER_params.h diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml index ed9ea4b8..dd6e9fed 100644 --- a/test/duplicate_consistency/lightsaber_avx2.yml +++ b/test/duplicate_consistency/lightsaber_avx2.yml @@ -6,7 +6,6 @@ consistency_checks: - api.h - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - SABER_params.h - verify.h @@ -20,7 +19,6 @@ consistency_checks: files: - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - verify.h - kem.c @@ -30,7 +28,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - kem.h - pack_unpack.h - poly.h - SABER_indcpa.h @@ -46,7 +43,6 @@ consistency_checks: files: - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - verify.h - kem.c @@ -56,7 +52,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - kem.h - pack_unpack.h - poly.h - SABER_indcpa.h diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml index 8146b7d0..2f36ec86 
100644 --- a/test/duplicate_consistency/lightsaber_clean.yml +++ b/test/duplicate_consistency/lightsaber_clean.yml @@ -5,7 +5,6 @@ consistency_checks: files: - api.h - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - SABER_params.h @@ -34,7 +33,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - verify.h @@ -60,7 +58,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - verify.h diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml index 4a04951d..0b4b60d7 100644 --- a/test/duplicate_consistency/saber_avx2.yml +++ b/test/duplicate_consistency/saber_avx2.yml @@ -6,7 +6,6 @@ consistency_checks: - api.h - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - SABER_params.h - verify.h @@ -20,7 +19,6 @@ consistency_checks: files: - cbd.h - pack_unpack.h - - kem.h - SABER_indcpa.h - verify.h - kem.c @@ -30,7 +28,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - kem.h - pack_unpack.h - poly.h - SABER_indcpa.h diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml index a2700ea8..7f5ba121 100644 --- a/test/duplicate_consistency/saber_clean.yml +++ b/test/duplicate_consistency/saber_clean.yml @@ -5,7 +5,6 @@ consistency_checks: files: - api.h - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - SABER_params.h @@ -34,7 +33,6 @@ consistency_checks: implementation: avx2 files: - cbd.h - - poly_mul.h - pack_unpack.h - SABER_indcpa.h - verify.h From 11b4772e73ae4967619356b517c715b21f27627b Mon Sep 17 00:00:00 2001 From: "John M. 
Schanck" Date: Wed, 28 Oct 2020 23:42:28 -0400 Subject: [PATCH 09/10] explicit casts --- crypto_kem/firesaber/META.yml | 4 +-- crypto_kem/firesaber/avx2/pack_unpack.c | 42 +++++++++++------------ crypto_kem/firesaber/clean/pack_unpack.c | 42 +++++++++++------------ crypto_kem/lightsaber/META.yml | 4 +-- crypto_kem/lightsaber/avx2/pack_unpack.c | 42 +++++++++++------------ crypto_kem/lightsaber/clean/pack_unpack.c | 42 +++++++++++------------ crypto_kem/saber/META.yml | 4 +-- crypto_kem/saber/avx2/pack_unpack.c | 38 ++++++++++---------- crypto_kem/saber/clean/pack_unpack.c | 38 ++++++++++---------- 9 files changed, 128 insertions(+), 128 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index 3d1a0a41..24363a85 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c index 82f5a3f0..d5e6b9ba 100644 --- a/crypto_kem/firesaber/avx2/pack_unpack.c +++ b/crypto_kem/firesaber/avx2/pack_unpack.c @@ -8,9 +8,9 @@ void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const p const uint16_t 
*in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & 0x3f) | (in[1] << 6); - out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4); - out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2); + out[0] = (uint8_t) ((in[0] & 0x3f) | (in[1] << 6)); + out[1] = (uint8_t) (((in[1] >> 2) & 0x0f) | (in[2] << 4)); + out[2] = (uint8_t) (((in[2] >> 4) & 0x03) | (in[3] << 2)); in += 4; out += 3; } @@ -36,19 +36,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -78,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 
2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); in += 4; out += 5; } diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c index 91ffd723..2d1538ae 100644 --- a/crypto_kem/firesaber/clean/pack_unpack.c +++ b/crypto_kem/firesaber/clean/pack_unpack.c @@ -8,9 +8,9 @@ void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = (in[0] & 0x3f) | (in[1] << 6); - out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4); - out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2); + out[0] = (uint8_t) ((in[0] & 0x3f) | (in[1] << 6)); + out[1] = (uint8_t) (((in[1] >> 2) & 0x0f) | (in[2] << 4)); + out[2] = (uint8_t) (((in[2] >> 4) & 0x03) | (in[3] << 2)); in += 4; out += 3; } @@ -36,19 +36,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + 
out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -78,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); in += 4; out += 5; } diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index 263db2e0..ad9d6acc 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: 
https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c index a154d24c..08f7a9d9 100644 --- a/crypto_kem/lightsaber/avx2/pack_unpack.c +++ b/crypto_kem/lightsaber/avx2/pack_unpack.c @@ -8,9 +8,9 @@ void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6); - out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7); - out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5); + out[0] = (uint8_t) ((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6)); + out[1] = (uint8_t) (((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7)); + out[2] = (uint8_t) (((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5)); in += 8; out += 3; } @@ -40,19 +40,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 
2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -82,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); in += 4; out += 5; } diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c index c1c8666c..106a62d4 100644 --- a/crypto_kem/lightsaber/clean/pack_unpack.c +++ b/crypto_kem/lightsaber/clean/pack_unpack.c @@ -8,9 +8,9 @@ void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6); - out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7); - out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5); + out[0] = (uint8_t) ((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6)); + out[1] = (uint8_t) (((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7)); + out[2] = (uint8_t) (((in[5] >> 1) & 
0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5)); in += 8; out += 3; } @@ -40,19 +40,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -82,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = 
(uint8_t) (in[3] >> 2); in += 4; out += 5; } diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index 319f4ebf..f6375c71 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c index a055b7e5..f9315d5d 100644 --- a/crypto_kem/saber/avx2/pack_unpack.c +++ b/crypto_kem/saber/avx2/pack_unpack.c @@ -8,7 +8,7 @@ void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 2; j++) { - out[0] = (in[0] & 0x0f) | (in[1] << 4); + out[0] = (uint8_t) ((in[0] & 0x0f) | (in[1] << 4)); in += 2; out += 1; } @@ -32,19 +32,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - 
out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -74,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); in += 4; out += 5; } diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c index 1b5bed81..89a98951 100644 --- a/crypto_kem/saber/clean/pack_unpack.c +++ b/crypto_kem/saber/clean/pack_unpack.c @@ -8,7 +8,7 @@ void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 2; j++) { - out[0] = (in[0] & 0x0f) | (in[1] << 4); + out[0] = (uint8_t) ((in[0] & 
0x0f) | (in[1] << 4)); in += 2; out += 1; } @@ -32,19 +32,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 8; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5); - out[2] = in[1] >> 3; - out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2); - out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7); - out[5] = in[3] >> 1; - out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4); - out[7] = in[4] >> 4; - out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1); - out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6); - out[10] = in[6] >> 2; - out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3); - out[12] = in[7] >> 5; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5)); + out[2] = (uint8_t) (in[1] >> 3); + out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2)); + out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7)); + out[5] = (uint8_t) (in[3] >> 1); + out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4)); + out[7] = (uint8_t) (in[4] >> 4); + out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1)); + out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6)); + out[10] = (uint8_t) (in[6] >> 2); + out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3)); + out[12] = (uint8_t) (in[7] >> 5); in += 8; out += 13; } @@ -74,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) const uint16_t *in = data->coeffs; uint8_t *out = bytes; for (j = 0; j < SABER_N / 4; j++) { - out[0] = in[0]; - out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2); - out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4); - out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6); - out[4] = in[3] >> 2; + out[0] = (uint8_t) (in[0]); + out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2)); + out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4)); + out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6)); + out[4] = (uint8_t) (in[3] >> 2); 
in += 4; out += 5; } From dd00b7fbd89d373286ea8e474ffdbb171580da8d Mon Sep 17 00:00:00 2001 From: "John M. Schanck" Date: Fri, 30 Oct 2020 18:01:44 -0400 Subject: [PATCH 10/10] slightly faster avx2 schoolbook multiplications --- crypto_kem/firesaber/META.yml | 4 +- crypto_kem/firesaber/avx2/poly_mul.c | 1336 ++++++++++++------------- crypto_kem/lightsaber/META.yml | 4 +- crypto_kem/lightsaber/avx2/poly_mul.c | 1336 ++++++++++++------------- crypto_kem/saber/META.yml | 4 +- crypto_kem/saber/avx2/poly_mul.c | 1336 ++++++++++++------------- 6 files changed, 1968 insertions(+), 2052 deletions(-) diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml index 24363a85..9e067250 100644 --- a/crypto_kem/firesaber/META.yml +++ b/crypto_kem/firesaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/firesaber/avx2/poly_mul.c b/crypto_kem/firesaber/avx2/poly_mul.c index d4e37d59..4d4ec959 100644 --- a/crypto_kem/firesaber/avx2/poly_mul.c +++ b/crypto_kem/firesaber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void 
schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, 
b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], 
temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = 
a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, 
temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); - c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = 
mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, 
b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = 
b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, 
b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); 
+ t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = 
_mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, 
temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], 
temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = 
mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = 
mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = 
mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + 
c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = 
b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 
32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); - schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } } diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml index ad9d6acc..ec0f7517 100644 --- a/crypto_kem/lightsaber/META.yml +++ b/crypto_kem/lightsaber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/lightsaber/avx2/poly_mul.c b/crypto_kem/lightsaber/avx2/poly_mul.c index 9ae8de05..51504491 100644 --- a/crypto_kem/lightsaber/avx2/poly_mul.c +++ b/crypto_kem/lightsaber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - 
__m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - 
c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - 
temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = 
b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); 
- c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = 
_mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 
= b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, 
b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = 
mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = 
mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, 
b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - 
temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp 
= mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, 
temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); 
+ t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 
= b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = 
mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] 
= mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); 
- schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } } diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml index f6375c71..742d77c5 100644 --- a/crypto_kem/saber/META.yml +++ b/crypto_kem/saber/META.yml @@ -14,9 +14,9 @@ principal-submitters: - Frederik Vercauteren implementations: - name: clean - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber - name: avx2 - version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber + version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber supported_platforms: - architecture: x86_64 operating_systems: diff --git a/crypto_kem/saber/avx2/poly_mul.c b/crypto_kem/saber/avx2/poly_mul.c index 5ec0aa73..2090e64f 100644 --- a/crypto_kem/saber/avx2/poly_mul.c +++ b/crypto_kem/saber/avx2/poly_mul.c @@ -4,701 +4,673 @@ #define L (SABER_N / 64) -static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { - return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); -} - -static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; +/* 16 word parallel multiply */ +#define mul(a, b) _mm256_mullo_epi16((a), (b)) +/* 16 word parallel multiply and accumulate */ +#define mac(a, b, c) 
_mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c)) +static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = mul_add(a0, b0, c[0]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[1] = _mm256_add_epi16(temp, c[1]); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - temp = mul_add(a2, b0, temp); - c[2] = _mm256_add_epi16(temp, c[2]); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - temp = mul_add(a3, b0, temp); - c[3] = _mm256_add_epi16(temp, c[3]); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - temp = mul_add(a2, b2, temp); - c[4] = _mm256_add_epi16(temp, c[4]); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - temp = mul_add(a5, b0, temp); - c[5] = _mm256_add_epi16(temp, c[5]); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a4, b2, temp); - c[6] = _mm256_add_epi16(temp, c[6]); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a5, b2, temp); - c[7] = _mm256_add_epi16(temp, c[7]); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, 
temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a6, b2, temp); - c[8] = _mm256_add_epi16(temp, c[8]); - - temp = _mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a7, b2, temp); - c[9] = _mm256_add_epi16(temp, c[9]); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a[8], b2, temp); - c[10] = _mm256_add_epi16(temp, c[10]); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - temp = mul_add(a[9], b2, temp); - c[11] = _mm256_add_epi16(temp, c[11]); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - temp = mul_add(a[10], b2, temp); - c[12] = _mm256_add_epi16(temp, c[12]); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, 
b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = mul_add(a[10], b3, temp); - temp = mul_add(a[11], b2, temp); - c[13] = _mm256_add_epi16(temp, c[13]); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - temp = mul_add(a[12], b2, temp); - c[14] = _mm256_add_epi16(temp, c[14]); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - temp = mul_add(a[13], b2, temp); - c[15] = _mm256_add_epi16(temp, c[15]); - - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, 
temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - temp = mul_add(a1, b[1], temp); - c[16] = _mm256_add_epi16(temp, c[16]); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - temp = mul_add(a1, b[2], temp); - c[17] = _mm256_add_epi16(temp, c[17]); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - temp = mul_add(a1, b[3], temp); - c[18] = _mm256_add_epi16(temp, c[18]); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - temp = mul_add(a1, b[4], temp); - c[19] = _mm256_add_epi16(temp, c[19]); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = 
mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - temp = mul_add(a1, b[5], temp); - c[20] = _mm256_add_epi16(temp, c[20]); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - temp = mul_add(a1, b[6], temp); - c[21] = _mm256_add_epi16(temp, c[21]); - - temp = _mm256_mullo_epi16(a[7], b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - temp = mul_add(a1, b[7], temp); - c[22] = _mm256_add_epi16(temp, c[22]); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - temp = mul_add(a1, b7, temp); - c[23] = _mm256_add_epi16(temp, c[23]); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - temp = mul_add(a1, b6, temp); - c[24] = _mm256_add_epi16(temp, c[24]); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - temp = mul_add(a1, b5, temp); - c[25] = _mm256_add_epi16(temp, c[25]); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - temp = mul_add(a1, b4, temp); - 
c[26] = _mm256_add_epi16(temp, c[26]); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - temp = mul_add(a1, b3, temp); - c[27] = _mm256_add_epi16(temp, c[27]); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - temp = mul_add(a1, b2, temp); - c[28] = _mm256_add_epi16(temp, c[28]); - - temp = _mm256_mullo_epi16(a0, b1); - temp = mul_add(a1, b0, temp); - c[29] = _mm256_add_epi16(temp, c[29]); - - c[30] = mul_add(a1, b1, c[30]); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mul(a0, b0); + t0 = mul(a0, b1); + c[1] = mac(a1, b0, t0); + t0 = mul(a0, b2); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[5] = mac(a3, b2, t0); + c[6] = mul(a3, b3); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[9] = mac(a3, b2, t0); + c[10] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[13] = mac(a3, b2, t0); + c[14] = mul(a3, b3); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); 
+ c[14] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[17] = mac(a3, b2, t0); + c[18] = mul(a3, b3); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[21] = mac(a3, b2, t0); + c[22] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, 
c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[25] = mac(a3, b2, 
t0); + c[26] = mul(a3, b3); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mul(a0, b3); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mul(a1, b3); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mul(a2, b3); + c[29] = mac(a3, b2, t0); + c[30] = mul(a3, b3); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + c[31] = _mm256_setzero_si256(); } - -static void schoolbook_avx(__m256i *c, 
const __m256i *a, const __m256i *b) { - __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; - __m256i temp; - +static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { + __m256i a0, a1, a2, a3; + __m256i b0, b1, b2, b3; + __m256i t0; a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; - b4 = b[4]; - b5 = b[5]; - b6 = b[6]; - b7 = b[7]; - - c[0] = _mm256_mullo_epi16(a0, b0); - - temp = _mm256_mullo_epi16(a0, b1); - c[1] = mul_add(a1, b0, temp); - - temp = _mm256_mullo_epi16(a0, b2); - temp = mul_add(a1, b1, temp); - c[2] = mul_add(a2, b0, temp); - - temp = _mm256_mullo_epi16(a0, b3); - temp = mul_add(a1, b2, temp); - temp = mul_add(a2, b1, temp); - c[3] = mul_add(a3, b0, temp); - - temp = _mm256_mullo_epi16(a0, b4); - temp = mul_add(a1, b3, temp); - temp = mul_add(a3, b1, temp); - temp = mul_add(a4, b0, temp); - c[4] = mul_add(a2, b2, temp); - - temp = _mm256_mullo_epi16(a0, b5); - temp = mul_add(a1, b4, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add( a4, b1, temp); - c[5] = mul_add(a5, b0, temp); - - temp = _mm256_mullo_epi16(a0, b6); - temp = mul_add(a1, b5, temp); - temp = mul_add(a5, b1, temp); - temp = mul_add(a6, b0, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a3, b3, temp); - c[6] = mul_add(a4, b2, temp); - - temp = _mm256_mullo_epi16(a0, b7); - temp = mul_add(a1, b6, temp); - temp = mul_add(a6, b1, temp); - temp = mul_add(a7, b0, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a4, b3, temp); - c[7] = mul_add(a5, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[8]); - temp = mul_add(a1, b7, temp); - temp = mul_add(a7, b1, temp); - temp = mul_add(a[8], b0, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a5, b3, temp); - c[8] = mul_add(a6, b2, temp); - - temp = 
_mm256_mullo_epi16(a0, b[9]); - temp = mul_add(a1, b[8], temp); - temp = mul_add(a[8], b1, temp); - temp = mul_add(a[9], b0, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a6, b3, temp); - c[9] = mul_add(a7, b2, temp); - - temp = _mm256_mullo_epi16(a0, b[10]); - temp = mul_add(a1, b[9], temp); - temp = mul_add(a[9], b1, temp); - temp = mul_add(a[10], b0, temp); - temp = mul_add(a2, b[8], temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a7, b3, temp); - c[10] = mul_add(a[8], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[11]); - temp = mul_add(a1, b[10], temp); - temp = mul_add(a[10], b1, temp); - temp = mul_add(a[11], b0, temp); - temp = mul_add(a2, b[9], temp); - temp = mul_add(a3, b[8], temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a[8], b3, temp); - c[11] = mul_add(a[9], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[12]); - temp = mul_add(a1, b[11], temp); - temp = mul_add(a[11], b1, temp); - temp = mul_add(a[12], b0, temp); - temp = mul_add(a2, b[10], temp); - temp = mul_add(a3, b[9], temp); - temp = mul_add(a4, b[8], temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a[8], b4, temp); - temp = mul_add(a[9], b3, temp); - c[12] = mul_add(a[10], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[13]); - temp = mul_add(a1, b[12], temp); - temp = mul_add(a[12], b1, temp); - temp = mul_add(a[13], b0, temp); - temp = mul_add(a2, b[11], temp); - temp = mul_add(a3, b[10], temp); - temp = mul_add(a4, b[9], temp); - temp = mul_add(a5, b[8], temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a[8], b5, temp); - temp = mul_add(a[9], b4, temp); - temp = 
mul_add(a[10], b3, temp); - c[13] = mul_add(a[11], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[14]); - temp = mul_add(a1, b[13], temp); - temp = mul_add(a[13], b1, temp); - temp = mul_add(a[14], b0, temp); - temp = mul_add(a2, b[12], temp); - temp = mul_add(a3, b[11], temp); - temp = mul_add(a4, b[10], temp); - temp = mul_add(a5, b[9], temp); - temp = mul_add(a6, b[8], temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a[8], b6, temp); - temp = mul_add(a[9], b5, temp); - temp = mul_add(a[10], b4, temp); - temp = mul_add(a[11], b3, temp); - c[14] = mul_add(a[12], b2, temp); - - temp = _mm256_mullo_epi16(a0, b[15]); - temp = mul_add(a1, b[14], temp); - temp = mul_add(a[14], b1, temp); - temp = mul_add(a[15], b0, temp); - temp = mul_add(a2, b[13], temp); - temp = mul_add(a3, b[12], temp); - temp = mul_add(a4, b[11], temp); - temp = mul_add(a5, b[10], temp); - temp = mul_add(a6, b[9], temp); - temp = mul_add(a7, b[8], temp); - temp = mul_add(a[8], b7, temp); - temp = mul_add(a[9], b6, temp); - temp = mul_add(a[10], b5, temp); - temp = mul_add(a[11], b4, temp); - temp = mul_add(a[12], b3, temp); - c[15] = mul_add(a[13], b2, temp); - - // unrolled second triangle - a0 = a[14]; - a1 = a[15]; - a2 = a[13]; - a3 = a[12]; - a4 = a[11]; - a5 = a[10]; - a6 = a[9]; - a7 = a[8]; - - b0 = b[14]; - b1 = b[15]; - b2 = b[13]; - b3 = b[12]; - b4 = b[11]; - b5 = b[10]; - b6 = b[9]; - b7 = b[8]; - - temp = _mm256_mullo_epi16(a[1], b1); - temp = mul_add(a[2], b0, temp); - temp = mul_add(a[3], b2, temp); - temp = mul_add(a[4], b3, temp); - temp = mul_add(a[5], b4, temp); - temp = mul_add(a[6], b5, temp); - temp = mul_add(a[7], b6, temp); - temp = mul_add(a7, b7, temp); - temp = mul_add(a6, b[7], temp); - temp = mul_add(a5, b[6], temp); - temp = mul_add(a4, b[5], temp); - temp = mul_add(a3, b[4], temp); - temp = mul_add(a2, b[3], temp); - temp = mul_add(a0, b[2], temp); - c[16] = mul_add(a1, b[1], temp); - - temp = _mm256_mullo_epi16(a[2], b1); - temp = mul_add(a[3], b0, temp); - 
temp = mul_add(a[4], b2, temp); - temp = mul_add(a[5], b3, temp); - temp = mul_add(a[6], b4, temp); - temp = mul_add(a[7], b5, temp); - temp = mul_add(a7, b6, temp); - temp = mul_add(a6, b7, temp); - temp = mul_add(a5, b[7], temp); - temp = mul_add(a4, b[6], temp); - temp = mul_add(a3, b[5], temp); - temp = mul_add(a2, b[4], temp); - temp = mul_add(a0, b[3], temp); - c[17] = mul_add(a1, b[2], temp); - - temp = _mm256_mullo_epi16(a[3], b1); - temp = mul_add(a[4], b0, temp); - temp = mul_add(a[5], b2, temp); - temp = mul_add(a[6], b3, temp); - temp = mul_add(a[7], b4, temp); - temp = mul_add(a7, b5, temp); - temp = mul_add(a6, b6, temp); - temp = mul_add(a5, b7, temp); - temp = mul_add(a4, b[7], temp); - temp = mul_add(a3, b[6], temp); - temp = mul_add(a2, b[5], temp); - temp = mul_add(a0, b[4], temp); - c[18] = mul_add(a1, b[3], temp); - - temp = _mm256_mullo_epi16(a[4], b1); - temp = mul_add(a[5], b0, temp); - temp = mul_add(a[6], b2, temp); - temp = mul_add(a[7], b3, temp); - temp = mul_add(a7, b4, temp); - temp = mul_add(a6, b5, temp); - temp = mul_add(a5, b6, temp); - temp = mul_add(a4, b7, temp); - temp = mul_add(a3, b[7], temp); - temp = mul_add(a2, b[6], temp); - temp = mul_add(a0, b[5], temp); - c[19] = mul_add(a1, b[4], temp); - - temp = _mm256_mullo_epi16(a[5], b1); - temp = mul_add(a[6], b0, temp); - temp = mul_add(a[7], b2, temp); - temp = mul_add(a7, b3, temp); - temp = mul_add(a6, b4, temp); - temp = mul_add(a5, b5, temp); - temp = mul_add(a4, b6, temp); - temp = mul_add(a3, b7, temp); - temp = mul_add(a2, b[7], temp); - temp = mul_add(a0, b[6], temp); - c[20] = mul_add(a1, b[5], temp); - - temp = _mm256_mullo_epi16(a[6], b1); - temp = mul_add(a[7], b0, temp); - temp = mul_add(a7, b2, temp); - temp = mul_add(a6, b3, temp); - temp = mul_add(a5, b4, temp); - temp = mul_add(a4, b5, temp); - temp = mul_add(a3, b6, temp); - temp = mul_add(a2, b7, temp); - temp = mul_add(a0, b[7], temp); - c[21] = mul_add(a1, b[6], temp); - - temp = _mm256_mullo_epi16(a[7], 
b1); - temp = mul_add(a7, b0, temp); - temp = mul_add(a6, b2, temp); - temp = mul_add(a5, b3, temp); - temp = mul_add(a4, b4, temp); - temp = mul_add(a3, b5, temp); - temp = mul_add(a2, b6, temp); - temp = mul_add(a0, b7, temp); - c[22] = mul_add(a1, b[7], temp); - - temp = _mm256_mullo_epi16(a7, b1); - temp = mul_add(a6, b0, temp); - temp = mul_add(a5, b2, temp); - temp = mul_add(a4, b3, temp); - temp = mul_add(a3, b4, temp); - temp = mul_add(a2, b5, temp); - temp = mul_add(a0, b6, temp); - c[23] = mul_add(a1, b7, temp); - - temp = _mm256_mullo_epi16(a6, b1); - temp = mul_add(a5, b0, temp); - temp = mul_add(a4, b2, temp); - temp = mul_add(a3, b3, temp); - temp = mul_add(a2, b4, temp); - temp = mul_add(a0, b5, temp); - c[24] = mul_add(a1, b6, temp); - - temp = _mm256_mullo_epi16(a5, b1); - temp = mul_add(a4, b0, temp); - temp = mul_add(a3, b2, temp); - temp = mul_add(a2, b3, temp); - temp = mul_add(a0, b4, temp); - c[25] = mul_add(a1, b5, temp); - - temp = _mm256_mullo_epi16(a4, b1); - temp = mul_add(a3, b0, temp); - temp = mul_add(a2, b2, temp); - temp = mul_add(a0, b3, temp); - c[26] = mul_add(a1, b4, temp); - - temp = _mm256_mullo_epi16(a3, b1); - temp = mul_add(a2, b0, temp); - temp = mul_add(a0, b2, temp); - c[27] = mul_add(a1, b3, temp); - - temp = _mm256_mullo_epi16(a2, b1); - temp = mul_add(a0, b0, temp); - c[28] = mul_add(a1, b2, temp); - - temp = _mm256_mullo_epi16(a0, b1); - c[29] = mul_add(a1, b0, temp); - - c[30] = _mm256_mullo_epi16(a1, b1); - - c[31] = _mm256_set_epi64x(0, 0, 0, 0); + c[0] = mac(a0, b0, c[0]); + t0 = mac(a0, b1, c[1]); + c[1] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[2]); + t0 = mac(a1, b1, t0); + c[2] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[3]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[3] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[4]); + t0 = mac(a2, b2, t0); + c[4] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[5]); + c[5] = mac(a3, b2, t0); + c[6] = mac(a3, b3, c[6]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[4] = 
mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + a0 = a[4]; + a1 = a[5]; + a2 = a[6]; + a3 = a[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, 
c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[4] = mac(a0, b0, c[4]); + t0 = mac(a0, b1, c[5]); + c[5] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[6]); + t0 = mac(a1, b1, t0); + c[6] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[7]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[7] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[8]); + t0 = mac(a2, b2, t0); + c[8] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[9]); + c[9] = mac(a3, b2, t0); + c[10] = mac(a3, b3, c[10]); + a0 = a[8]; + a1 = a[9]; + a2 = a[10]; + a3 = a[11]; + c[8] = mac(a0, b0, c[8]); + t0 = mac(a0, b1, c[9]); + c[9] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[10]); + t0 = mac(a1, b1, t0); + c[10] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[11]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[11] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[12]); + t0 = mac(a2, b2, t0); + c[12] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[13]); + c[13] = mac(a3, b2, t0); + c[14] = mac(a3, b3, c[14]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, 
t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[12]; + b1 = b[13]; + b2 = b[14]; + b3 = b[15]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + a0 = a[12]; + a1 = a[13]; + a2 = a[14]; + a3 = a[15]; + c[24] = mac(a0, b0, c[24]); + t0 = mac(a0, b1, c[25]); + c[25] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[26]); + t0 = mac(a1, b1, t0); + c[26] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[27]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[27] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[28]); + t0 = mac(a2, b2, t0); + c[28] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[29]); + c[29] = mac(a3, b2, t0); + c[30] = mac(a3, b3, c[30]); + b0 = b[8]; + b1 = b[9]; + b2 = b[10]; + b3 = b[11]; + c[20] = mac(a0, b0, c[20]); + t0 = mac(a0, b1, c[21]); + c[21] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[22]); + t0 = mac(a1, b1, t0); + c[22] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[23]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[23] = mac(a3, b0, t0); + t0 = mac(a1, b3, 
c[24]); + t0 = mac(a2, b2, t0); + c[24] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[25]); + c[25] = mac(a3, b2, t0); + c[26] = mac(a3, b3, c[26]); + b0 = b[4]; + b1 = b[5]; + b2 = b[6]; + b3 = b[7]; + c[16] = mac(a0, b0, c[16]); + t0 = mac(a0, b1, c[17]); + c[17] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[18]); + t0 = mac(a1, b1, t0); + c[18] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[19]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[19] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[20]); + t0 = mac(a2, b2, t0); + c[20] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[21]); + c[21] = mac(a3, b2, t0); + c[22] = mac(a3, b3, c[22]); + b0 = b[0]; + b1 = b[1]; + b2 = b[2]; + b3 = b[3]; + c[12] = mac(a0, b0, c[12]); + t0 = mac(a0, b1, c[13]); + c[13] = mac(a1, b0, t0); + t0 = mac(a0, b2, c[14]); + t0 = mac(a1, b1, t0); + c[14] = mac(a2, b0, t0); + t0 = mac(a0, b3, c[15]); + t0 = mac(a1, b2, t0); + t0 = mac(a2, b1, t0); + c[15] = mac(a3, b0, t0); + t0 = mac(a1, b3, c[16]); + t0 = mac(a2, b2, t0); + c[16] = mac(a3, b1, t0); + t0 = mac(a2, b3, c[17]); + c[17] = mac(a3, b2, t0); + c[18] = mac(a3, b3, c[18]); } + static void transpose(__m256i *M) { __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; __m256i temp, temp0, temp1, temp2; @@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co //-----------------Forward transposes ends--------------------------------- if (accumulate == 0) { - schoolbook_avx(vc, va, vb); - schoolbook_avx(vc + 32, va + 16, vb + 16); - schoolbook_avx(vc + 64, va + 32, vb + 32); - schoolbook_avx(vc + 96, va + 48, vb + 48); + schoolbook16x16(vc, va, vb); + schoolbook16x16(vc + 32, va + 16, vb + 16); + schoolbook16x16(vc + 64, va + 32, vb + 32); + schoolbook16x16(vc + 96, va + 48, vb + 48); } else { - schoolbook_avx_acc(vc, va, vb); - schoolbook_avx_acc(vc + 32, va + 16, vb + 16); - schoolbook_avx_acc(vc + 64, va + 32, vb + 32); - schoolbook_avx_acc(vc + 96, va + 48, vb + 48); + schoolbook16x16_acc(vc, va, vb); + 
schoolbook16x16_acc(vc + 32, va + 16, vb + 16); + schoolbook16x16_acc(vc + 64, va + 32, vb + 32); + schoolbook16x16_acc(vc + 96, va + 48, vb + 48); } }