@@ -14,9 +14,9 @@ principal-submitters: | |||
- Frederik Vercauteren | |||
implementations: | |||
- name: clean | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
- name: avx2 | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,7 +2,7 @@ | |||
LIB=libfiresaber_avx2.a | |||
HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h | |||
OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o | |||
OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o | |||
CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -1,416 +1,125 @@ | |||
#include "./polymul/toom-cook_4way.c" | |||
#include "SABER_indcpa.h" | |||
#include "SABER_params.h" | |||
#include "api.h" | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "randombytes.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
//#include "randombytes.h" | |||
//#include "./polymul/toom_cook_4/toom-cook_4way.c" | |||
#define h1 4 //2^(EQ-EP-1) | |||
#define h1 (1 << (SABER_EQ - SABER_EP - 1)) | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly *skpv1 = A[0]; // use first row of A to hold sk temporarily | |||
toom4_points skpv1_eval[SABER_L]; | |||
poly res[SABER_L]; | |||
static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { | |||
int32_t i, j; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
message_dec[j] = 0; | |||
for (i = 0; i < 8; i++) { | |||
message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); | |||
} | |||
} | |||
} | |||
/*----------------------------------------------------------------------------------- | |||
This routine generates a=[Matrix K x K] of 256-coefficient polynomials | |||
static void GenMatrix(polyvec *a, const uint8_t *seed) { | |||
uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; | |||
uint16_t temp_ar[SABER_N]; | |||
int i, j, k; | |||
uint16_t mod = (SABER_Q - 1); | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
PQCLEAN_FIRESABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); | |||
for (k = 0; k < SABER_N; k++) { | |||
a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; | |||
} | |||
} | |||
} | |||
} | |||
static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { | |||
uint32_t i; | |||
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; | |||
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_FIRESABER_AVX2_GenSecret(skpv1, rand); | |||
PQCLEAN_FIRESABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
PQCLEAN_FIRESABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); | |||
} | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { | |||
int64_t i, j; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
for (j = 0; j < NUM_POLY; j++) { | |||
PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order | |||
if (isTranspose == 0) { | |||
toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); | |||
} else { | |||
toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
TC_interpol(c_bucket, res_avx[i]); | |||
} | |||
} | |||
static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { | |||
int64_t i; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); | |||
} | |||
TC_interpol(c_bucket, res_avx); | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[SABER_K]; | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint8_t noiseseed[SABER_COINBYTES]; | |||
int32_t i, j, k; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
randombytes(seed, SABER_SEEDBYTES); | |||
shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state | |||
randombytes(noiseseed, SABER_COINBYTES); | |||
GenMatrix(a, seed); //sample matrix A | |||
GenSecret(skpv1, noiseseed); | |||
// Load sk into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
// Load a into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
} | |||
} | |||
//------------------------do the matrix vector multiplication and rounding------------ | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//------------------Pack sk into byte string------- | |||
PQCLEAN_FIRESABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); | |||
//------------------Pack pk into byte string------- | |||
for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_FIRESABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. | |||
pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; | |||
} | |||
PQCLEAN_FIRESABER_AVX2_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
toom4_points skpv1_eval[SABER_L]; | |||
uint32_t i, j, k; | |||
polyvec a[SABER_K]; // skpv; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint16_t temp[SABER_K][SABER_N]; | |||
uint16_t message[SABER_KEYBYTES * 8]; | |||
uint8_t msk_c[SABER_SCALEBYTES_KEM]; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod, mod_p; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i vprime_avx[SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i pkcl_avx[SABER_K][SABER_N / 16]; | |||
__m256i message_avx[SABER_N / 16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
mod_p = _mm256_set1_epi16(SABER_P - 1); | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. | |||
seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; | |||
PQCLEAN_FIRESABER_AVX2_GenSecret(temp, noiseseed); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); | |||
} | |||
GenMatrix(a, seed); | |||
GenSecret(skpv1, noiseseed); | |||
// ----------- Load skpv1 into avx vectors ---------- | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A); | |||
PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed | |||
// ----------- Load a into avx vectors ----------
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
//-----------------matrix-vector multiplication and rounding | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//-----this result should be put in b_prime for later use in server. | |||
for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_FIRESABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string | |||
//**************client matrix-vector multiplication ends******************// | |||
//------now calculate the v' | |||
//-------unpack the public_key | |||
PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); | |||
} | |||
} | |||
// InnerProduct | |||
//for(k=0;k<SABER_N/16;k++){ | |||
// vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]); | |||
//} | |||
PQCLEAN_FIRESABER_AVX2_POLVECp2BS(ciphertext, res); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_FIRESABER_AVX2_BS2POLVECp(temp, pk); | |||
PQCLEAN_FIRESABER_AVX2_InnerProd(vprime, temp, skpv1_eval); | |||
PQCLEAN_FIRESABER_AVX2_BS2POLmsg(message, m); | |||
vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); | |||
// Computation of v'+h1 | |||
for (i = 0; i < SABER_N / 16; i++) { //adding h1 | |||
vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1)); | |||
} | |||
// unpack m; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
message[8 * j + i] = ((m[j] >> i) & 0x01); | |||
} | |||
} | |||
// message encoding | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); | |||
message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); | |||
} | |||
// SHIFTRIGHT(v'+h1-m mod p, EP-ET) | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); | |||
vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); | |||
vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); | |||
} | |||
// Unpack avx | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); | |||
} | |||
PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(msk_c, temp[0]); | |||
for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { | |||
ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_FIRESABER_AVX2_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
uint32_t i, j; | |||
uint16_t sksv[SABER_K][SABER_N]; //secret key of the server | |||
uint16_t pksv[SABER_K][SABER_N]; | |||
uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; | |||
uint8_t scale_ar[SABER_SCALEBYTES_KEM]; | |||
uint16_t op[SABER_N]; | |||
//--------------AVX declaration------------------ | |||
//__m256i mod_p; | |||
__m256i v_avx[SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i sksv_avx[SABER_K][SABER_N / 16]; | |||
__m256i pksv_avx[SABER_K][SABER_N / 16]; | |||
poly temp[SABER_L]; | |||
toom4_points sksv_eval[SABER_L]; | |||
//mod_p=_mm256_set1_epi16(SABER_P-1); | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
//-------unpack the secret key and the ciphertext
PQCLEAN_FIRESABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key | |||
PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); | |||
pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); | |||
} | |||
PQCLEAN_FIRESABER_AVX2_BS2POLVECq(temp, sk); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_FIRESABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); | |||
} | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); | |||
} | |||
// InnerProduct(b', s, mod p) | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sksv_avx[j], b_bucket[j]); | |||
} | |||
PQCLEAN_FIRESABER_AVX2_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_FIRESABER_AVX2_InnerProd(v, temp, sksv_eval); | |||
vector_vector_mul(v_avx, pksv_avx, b_bucket); | |||
PQCLEAN_FIRESABER_AVX2_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
_mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); | |||
} | |||
for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { | |||
scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; | |||
} | |||
PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(op, scale_ar); | |||
//addition of h2 | |||
for (i = 0; i < SABER_N; i++) { | |||
message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
POL2MSG(m, message_dec_unpacked); | |||
PQCLEAN_FIRESABER_AVX2_POLmsg2BS(m, v); | |||
} |
@@ -1,45 +1,41 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
#include "api.h" | |||
#define SABER_K 4 | |||
/* Don't change anything below this line */ | |||
#define SABER_L 4 | |||
#define SABER_MU 6 | |||
#define SABER_ET 6 | |||
#define SABER_EQ 13 | |||
#define SABER_EP 10 | |||
#define SABER_N 256 | |||
#define SABER_Q 8192 //2^13 | |||
#define SABER_P 1024 | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_COINBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_EP 10 | |||
#define SABER_P (1 << SABER_EP) | |||
#define SABER_HASHBYTES 32 | |||
#define SABER_EQ 13 | |||
#define SABER_Q (1 << SABER_EQ) | |||
#define SABER_POLYBYTES 416 //13*256/8 | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_HASHBYTES 32 | |||
#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) | |||
#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) | |||
#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation | |||
#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) | |||
#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) | |||
#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) | |||
#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) | |||
#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) | |||
#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) | |||
#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) | |||
#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) | |||
#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) | |||
#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) | |||
#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) | |||
#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) | |||
#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ | |||
#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) | |||
#endif |
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
static uint64_t load_littleendian(const uint8_t *x, int bytes) { | |||
int i; | |||
uint64_t r = x[0]; | |||
for (i = 1; i < bytes; i++) { | |||
@@ -20,33 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
return r; | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { | |||
uint16_t Qmod_minus1 = SABER_Q - 1; | |||
void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { | |||
uint32_t t, d, a[4], b[4]; | |||
int i, j; | |||
for (i = 0; i < SABER_N / 4; i++) { | |||
t = load_littleendian(buf + 3 * i, 3); | |||
t = (uint32_t) load_littleendian(buf + 3 * i, 3); | |||
d = 0; | |||
for (j = 0; j < 3; j++) { | |||
d += (t >> j) & 0x249249; | |||
} | |||
a[0] = d & 0x7; | |||
b[0] = (d >> 3) & 0x7; | |||
a[1] = (d >> 6) & 0x7; | |||
b[1] = (d >> 9) & 0x7; | |||
a[0] = d & 0x7; | |||
b[0] = (d >> 3) & 0x7; | |||
a[1] = (d >> 6) & 0x7; | |||
b[1] = (d >> 9) & 0x7; | |||
a[2] = (d >> 12) & 0x7; | |||
b[2] = (d >> 15) & 0x7; | |||
a[3] = (d >> 18) & 0x7; | |||
b[3] = (d >> 21); | |||
r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; | |||
r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; | |||
r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; | |||
r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; | |||
s[4 * i + 0] = (uint16_t)(a[0] - b[0]); | |||
s[4 * i + 1] = (uint16_t)(a[1] - b[1]); | |||
s[4 * i + 2] = (uint16_t)(a[2] - b[2]); | |||
s[4 * i + 3] = (uint16_t)(a[3] - b[3]); | |||
} | |||
} |
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
#include "poly.h" | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); | |||
void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); | |||
#endif |
@@ -4,14 +4,12 @@ | |||
#include "fips202.h" | |||
#include "randombytes.h" | |||
#include "verify.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk | |||
for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -39,7 +37,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t | |||
sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); | |||
// K^ <-- kr[0:31] | |||
// noiseseed (r) <-- kr[32:63]; | |||
PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); | |||
@@ -49,7 +47,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t | |||
} | |||
int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
uint8_t fail; | |||
uint8_t cmp[SABER_BYTES_CCA_DEC]; | |||
uint8_t buf[64]; | |||
@@ -65,7 +63,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const ui | |||
sha3_512(kr, buf, 64); | |||
PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); | |||
PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); | |||
fail = PQCLEAN_FIRESABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); | |||
@@ -1,35 +1,3 @@ | |||
#ifndef INDCPA_H | |||
#define INDCPA_H | |||
#include <stdint.h> | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); | |||
void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); | |||
int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); | |||
int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); | |||
//uint64_t clock1,clock2; | |||
//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; | |||
#endif |
@@ -1,502 +1,149 @@ | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); | |||
bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; | |||
data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; | |||
data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); | |||
data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; | |||
data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; | |||
data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); | |||
data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); | |||
data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
data[offset_data] = bytes[j] & 0x0f; | |||
data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); | |||
out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); | |||
out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); | |||
in += 4; | |||
out += 3; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; | |||
data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; | |||
data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; | |||
data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
out[0] = in[0] & 0x3f; | |||
out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); | |||
out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); | |||
out[3] = ((in[2] & 0xff) >> 2); | |||
in += 3; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
} | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
/*
 * Unpacks one polynomial of 13-bit coefficients from `bytes` into
 * data->coeffs. Each run of thirteen bytes expands to eight coefficients,
 * low bits first.
 */
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
    size_t blk;

    for (blk = 0; blk < SABER_N / 8; blk++) {
        const uint8_t *b = bytes + 13 * blk;
        uint16_t *c = data->coeffs + 8 * blk;

        c[0] = (b[0] & (0xff)) | ((b[1] & 0x1f) << 8);
        c[1] = ((b[1] >> 5) & (0x07)) | ((b[2] & 0xff) << 3) | ((b[3] & 0x03) << 11);
        c[2] = ((b[3] >> 2) & (0x3f)) | ((b[4] & 0x7f) << 6);
        c[3] = ((b[4] >> 7) & (0x01)) | ((b[5] & 0xff) << 1) | ((b[6] & 0x0f) << 9);
        c[4] = ((b[6] >> 4) & (0x0f)) | ((b[7] & 0xff) << 4) | ((b[8] & 0x01) << 12);
        c[5] = ((b[8] >> 1) & (0x7f)) | ((b[9] & 0x3f) << 7);
        c[6] = ((b[9] >> 6) & (0x03)) | ((b[10] & 0xff) << 2) | ((b[11] & 0x07) << 10);
        c[7] = ((b[11] >> 3) & (0x1f)) | ((b[12] & 0xff) << 5);
    }
}
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
/*
 * Unpacks one polynomial of 10-bit coefficients from `bytes` into
 * data->coeffs. Each run of five bytes expands to four coefficients,
 * low bits first.
 */
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
    size_t blk;

    for (blk = 0; blk < SABER_N / 4; blk++) {
        const uint8_t *b = bytes + 5 * blk;
        uint16_t *c = data->coeffs + 4 * blk;

        c[0] = (b[0] & (0xff)) | ((b[1] & 0x03) << 8);
        c[1] = ((b[1] >> 2) & (0x3f)) | ((b[2] & 0x0f) << 6);
        c[2] = ((b[2] >> 4) & (0x0f)) | ((b[3] & 0x3f) << 4);
        c[3] = ((b[3] >> 6) & (0x03)) | ((b[4] & 0xff) << 2);
    }
}
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
//for(i=0;i<SABER_K;i++){ | |||
//i=0; | |||
//offset_byte1=i*(SABER_N*13)/8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
//offset_byte=offset_byte1+13*j; | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
//} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
/*This function packs 11 bit data stream into 8 bits of data. | |||
*/ | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); | |||
data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); | |||
data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); | |||
data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); | |||
data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); | |||
} | |||
/*
 * Deserialises a vector of SABER_L polynomials by calling BS2POLp on each
 * one; polynomial k is read from offset k * SABER_POLYCOMPRESSEDBYTES in
 * `bytes`.
 */
void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
    const uint8_t *src = bytes;
    size_t k;

    for (k = 0; k < SABER_L; k++) {
        BS2POLp(data + k, src);
        src += SABER_POLYCOMPRESSEDBYTES;
    }
}
void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); | |||
/*
 * Expands a SABER_KEYBYTES-byte message into one coefficient per bit:
 * coefficient 8*j + i receives bit i (counting from the least significant
 * bit) of bytes[j], as the value 0 or 1.
 */
void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) {
    size_t bit;

    /* A single pass over all bit positions: byte index is bit / 8,
     * position inside the byte is bit % 8. */
    for (bit = 0; bit < (size_t)SABER_KEYBYTES * 8; bit++) {
        data->coeffs[bit] = ((bytes[bit >> 3] >> (bit & 7)) & 0x01);
    }
}
void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} | |||
/*
 * Dispatches polynomial-vector serialisation on the modulus:
 * 1024 selects POLVECp2BS, 8192 selects POLVECq2BS. Any other modulus
 * leaves `bytes` untouched.
 */
void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data);
        break;
    case 8192:
        PQCLEAN_FIRESABER_AVX2_POLVECq2BS(bytes, data);
        break;
    default:
        /* Unsupported modulus: nothing is written. */
        break;
    }
}
/*
 * Dispatches polynomial-vector deserialisation on the modulus:
 * 1024 selects BS2POLVECp, 8192 selects BS2POLVECq. Any other modulus
 * leaves `data` untouched.
 */
void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes);
        break;
    case 8192:
        PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes);
        break;
    default:
        /* Unsupported modulus: nothing is read. */
        break;
    }
}
@@ -1,56 +1,28 @@ | |||
#ifndef PACK_UNPACK_H | |||
#define PACK_UNPACK_H | |||
#include "SABER_params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); | |||
void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); | |||
#endif |
@@ -0,0 +1,62 @@ | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
/*
 * Computes c = A * s, or c = A^T * s when `transpose` is non-zero.
 *
 * s_eval holds the already-evaluated form of each entry of s. For every
 * output row the SABER_L products are accumulated in a single
 * toom4_points_product and interpolated once at the end of the row.
 */
void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) {
    size_t row, col;
    toom4_points_product acc;

    for (row = 0; row < SABER_L; row++) {
        for (col = 0; col < SABER_L; col++) {
            /* Swap the indices to walk a column of A instead of a row. */
            const poly *a = transpose ? &A[col][row] : &A[row][col];

            /* The first product overwrites acc (flag 0); later ones add to it (flag 1). */
            PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&acc, a, &s_eval[col], col != 0);
        }
        PQCLEAN_FIRESABER_AVX2_toom4_interp(&c[row], &acc);
    }
}
/*
 * Computes the inner product c = sum_i b[i] * s[i], where s_eval holds the
 * already-evaluated form of each s[i]. All SABER_L products are
 * accumulated before a single interpolation writes the result to c.
 */
void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) {
    size_t k;
    toom4_points_product acc;

    for (k = 0; k < SABER_L; k++) {
        /* The first product overwrites acc (flag 0); later ones add to it (flag 1). */
        PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&acc, &b[k], &s_eval[k], k != 0);
    }
    PQCLEAN_FIRESABER_AVX2_toom4_interp(c, &acc);
}
/*
 * Fills the SABER_L x SABER_L matrix A from `seed`: SHAKE128 expands the
 * seed to SABER_L * SABER_POLYVECBYTES bytes, and each consecutive
 * SABER_POLYVECBYTES-byte slice is unpacked into one row of A.
 */
void PQCLEAN_FIRESABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
    uint8_t stream[SABER_L * SABER_POLYVECBYTES];
    const uint8_t *slice = stream;
    size_t row;

    shake128(stream, sizeof(stream), seed, SABER_SEEDBYTES);

    for (row = 0; row < SABER_L; row++) {
        PQCLEAN_FIRESABER_AVX2_BS2POLVECq(A[row], slice);
        slice += SABER_POLYVECBYTES;
    }
}
/*
 * Fills the secret vector s from `seed`: SHAKE128 expands the seed to
 * SABER_L * SABER_POLYCOINBYTES bytes, and each consecutive
 * SABER_POLYCOINBYTES-byte slice is passed to the cbd sampler to produce
 * the coefficients of one polynomial.
 */
void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
    uint8_t stream[SABER_L * SABER_POLYCOINBYTES];
    const uint8_t *slice = stream;
    size_t k;

    shake128(stream, sizeof(stream), seed, SABER_NOISESEEDBYTES);

    for (k = 0; k < SABER_L; k++) {
        PQCLEAN_FIRESABER_AVX2_cbd(s[k].coeffs, slice);
        slice += SABER_POLYCOINBYTES;
    }
}
@@ -1,27 +1,38 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
/*--------------------------------------------------------------------- | |||
This file has been adapted from the implementation | |||
(available at, Public Domain https://github.com/pq-crystals/kyber) | |||
of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
#include "SABER_params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef struct { | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
__m256i dummy; | |||
} poly; | |||
typedef struct { | |||
poly vec[SABER_K]; | |||
} polyvec; | |||
/* Buffer of 4 * SABER_N 16-bit values; used in this file's prototypes as the
 * evaluated form of a polynomial (output of toom4_eval, input of
 * toom4_mul_A_by_B_eval).
 * NOTE(review): the __m256i member is presumably there only to force
 * 32-byte alignment of coeffs for AVX2 loads/stores - confirm. */
typedef union { | |||
uint16_t coeffs[4 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points; | |||
void PQCLEAN_FIRESABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); | |||
/* Buffer of 8 * SABER_N 16-bit values; used in this file's prototypes as the
 * accumulator written by toom4_mul_A_by_B_eval and consumed by toom4_interp.
 * NOTE(review): the __m256i member is presumably there only to force
 * 32-byte alignment of coeffs for AVX2 loads/stores - confirm. */
typedef union { | |||
uint16_t coeffs[8 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points_product; | |||
void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); | |||
void PQCLEAN_FIRESABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); | |||
void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); | |||
void PQCLEAN_FIRESABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_FIRESABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); | |||
void PQCLEAN_FIRESABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); | |||
void PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); | |||
#endif |
@@ -1,20 +0,0 @@ | |||
#include "../SABER_params.h" | |||
#define AVX_N (SABER_N >> 4) | |||
#define small_len_avx (AVX_N >> 2) | |||
#define SCHB_N 16 | |||
#define N_SB (SABER_N >> 2) | |||
#define N_SB_RES (2*N_SB-1) | |||
#define N_SB_16 (N_SB >> 2) | |||
#define N_SB_16_RES (2*N_SB_16-1) | |||
#define AVX_N1 16 /*N/16*/ | |||
#define SCM_SIZE 16 | |||
// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements | |||
#define NUM_POLY SABER_K | |||
//int NUM_POLY=2; |
@@ -1,303 +0,0 @@ | |||
#include <immintrin.h> | |||
// In-place shuffle of the sixteen 256-bit vectors M[0..15].
// The network interleaves 16-bit lanes of row pairs (unpacklo/unpackhi epi16),
// then 32-bit and 64-bit groups, and finally recombines 128-bit halves with
// permute2f128 before storing back into M.
// NOTE(review): going by the name and the shuffle pattern this looks like a
// 16x16 transpose of 16-bit elements - confirm against the callers.
// Statement order matters: M[0], M[1], M[8], M[9] are overwritten only after
// every unpack that still needs the original M[0..9] has been issued.
static void transpose_n1(__m256i *M) | |||
{ | |||
//int i; | |||
register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; | |||
register __m256i temp, temp0, temp1, temp2; | |||
//for(i=0; i<8; i=i+1) | |||
//{ | |||
// Low halves: interleave 16-bit lanes of each adjacent row pair.
r0 = _mm256_unpacklo_epi16(M[0], M[1]); | |||
r1 = _mm256_unpacklo_epi16(M[2], M[3]); | |||
r2 = _mm256_unpacklo_epi16(M[4], M[5]); | |||
r3 = _mm256_unpacklo_epi16(M[6], M[7]); | |||
r4 = _mm256_unpacklo_epi16(M[8], M[9]); | |||
r5 = _mm256_unpacklo_epi16(M[10], M[11]); | |||
r6 = _mm256_unpacklo_epi16(M[12], M[13]); | |||
r7 = _mm256_unpacklo_epi16(M[14], M[15]); | |||
// Interleave the results at 32-bit granularity.
temp = _mm256_unpacklo_epi32(r0, r1); | |||
temp0 = _mm256_unpacklo_epi32(r2, r3); | |||
temp1 = _mm256_unpacklo_epi32(r4, r5); | |||
temp2 = _mm256_unpacklo_epi32(r6, r7); | |||
r8 = _mm256_unpackhi_epi32(r0, r1); | |||
r9 = _mm256_unpackhi_epi32(r2, r3); | |||
r10 = _mm256_unpackhi_epi32(r4, r5); | |||
r11 = _mm256_unpackhi_epi32(r6, r7); | |||
// Interleave at 64-bit granularity.
r0 = _mm256_unpacklo_epi64(temp, temp0); | |||
r2 = _mm256_unpackhi_epi64(temp, temp0); | |||
r1 = _mm256_unpacklo_epi64(temp1, temp2); | |||
r3 = _mm256_unpackhi_epi64(temp1, temp2); | |||
// High halves of rows 0..9 are read here, before M[0], M[1], M[8], M[9] are overwritten below.
temp = _mm256_unpackhi_epi16(M[0], M[1]); | |||
temp0 = _mm256_unpackhi_epi16(M[2], M[3]); | |||
temp1 = _mm256_unpackhi_epi16(M[4], M[5]); | |||
temp2 = _mm256_unpackhi_epi16(M[6], M[7]); | |||
r4 = _mm256_unpackhi_epi16(M[8], M[9]); | |||
// Selector 0x20 joins the two low 128-bit halves, 0x31 the two high halves.
M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
r5 = _mm256_unpackhi_epi16(M[10], M[11]); | |||
r6 = _mm256_unpackhi_epi16(M[12], M[13]); | |||
r7 = _mm256_unpackhi_epi16(M[14], M[15]); | |||
r0 = _mm256_unpacklo_epi64(r8, r9); | |||
r1 = _mm256_unpacklo_epi64(r10, r11); | |||
r2 = _mm256_unpackhi_epi64(r8, r9); | |||
r3 = _mm256_unpackhi_epi64(r10, r11); | |||
M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
//for(i=0; i<4; i=i+1) | |||
//{ | |||
// Same 32/64-bit interleave steps, now applied to the high-half unpacks.
r0 = _mm256_unpacklo_epi32(temp, temp0); | |||
r1 = _mm256_unpacklo_epi32(temp1, temp2); | |||
r2 = _mm256_unpacklo_epi32(r4, r5); | |||
r3 = _mm256_unpacklo_epi32(r6, r7); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
r8 = _mm256_unpacklo_epi64(r0, r1); | |||
r10 = _mm256_unpackhi_epi64(r0, r1); | |||
r9 = _mm256_unpacklo_epi64(r2, r3); | |||
r11 = _mm256_unpackhi_epi64(r2, r3); | |||
M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); | |||
M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); | |||
M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); | |||
M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); | |||
r0 = _mm256_unpackhi_epi32(temp, temp0); | |||
r1 = _mm256_unpackhi_epi32(temp1, temp2); | |||
r2 = _mm256_unpackhi_epi32(r4, r5); | |||
r3 = _mm256_unpackhi_epi32(r6, r7); | |||
//} | |||
// for(i=0; i<2; i=i+1) | |||
// { | |||
r4 = _mm256_unpacklo_epi64(r0, r1); | |||
r6 = _mm256_unpackhi_epi64(r0, r1); | |||
r5 = _mm256_unpacklo_epi64(r2, r3); | |||
r7 = _mm256_unpackhi_epi64(r2, r3); | |||
// } | |||
//------------------------------------------------------- | |||
M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); | |||
M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); | |||
M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); | |||
M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); | |||
} | |||
/* | |||
void transpose_unrolled(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
__m256i r0, r1, r2, r3, r4, r5, r6, r7; | |||
//for(i=0; i<8; i=i+1) | |||
//{ | |||
tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); | |||
tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); | |||
tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); | |||
tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); | |||
tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); | |||
tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); | |||
tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); | |||
tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); | |||
tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); | |||
tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); | |||
tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); | |||
tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); | |||
tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); | |||
tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); | |||
tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); | |||
tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); | |||
//} | |||
//------------------------------------------------------- | |||
//for(i=0; i<4; i=i+1) | |||
//{ | |||
bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); | |||
bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); | |||
bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); | |||
bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); | |||
bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); | |||
bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); | |||
bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); | |||
bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); | |||
dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); | |||
dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); | |||
dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); | |||
eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); | |||
eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); | |||
eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); | |||
//} | |||
//------------------------------------------------------- | |||
//------------------------------------------------------- | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
//------------------------------------------------------- | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
void transpose1(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
for(i=0; i<8; i=i+1) | |||
{ | |||
tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); | |||
tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); | |||
bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); | |||
dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); | |||
eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
*/ |
@@ -1,753 +0,0 @@ | |||
//#define SCM_SIZE 16 | |||
//#pragma STDC FP_CONTRACT ON | |||
#include <immintrin.h> | |||
// Lane-wise multiply-accumulate over 16-bit lanes: returns a*b + c.
// Only the low 16 bits of each product are kept and the addition wraps,
// so every lane is computed modulo 2^16.
static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
    const __m256i product = _mm256_mullo_epi16(a, b);

    return _mm256_add_epi16(product, c);
}
static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are added cummulatively | |||
{ | |||
register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
register __m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
//otherwise accumulate | |||
c_avx[0] = mul_add(a0, b0, c_avx[0]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
temp=mul_add(a2, b0, temp); | |||
c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
temp=mul_add(a3, b0, temp); | |||
c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
temp=mul_add(a2, b2, temp); | |||
c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
temp=mul_add(a5, b0, temp); | |||
c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
temp=mul_add(a4, b2, temp); | |||
c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
temp=mul_add(a5, b2, temp); | |||
c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
temp=mul_add(a6, b2, temp); | |||
c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
temp=mul_add(a7, b2, temp); | |||
c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
temp=mul_add(a[8], b2, temp); | |||
c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
temp=mul_add(a[9], b2, temp); | |||
c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
temp=mul_add(a[10], b2, temp); | |||
c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
temp=mul_add(a[11], b2, temp); | |||
c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
temp=mul_add(a[12], b2, temp); | |||
c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
temp=mul_add(a[13], b2, temp); | |||
c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
temp=mul_add(a1, b[1], temp); | |||
c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
temp=mul_add(a1, b[2], temp); | |||
c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
temp=mul_add(a1, b[3], temp); | |||
c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
temp=mul_add(a1, b[4], temp); | |||
c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
temp=mul_add(a1, b[5], temp); | |||
c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
temp=mul_add(a1, b[6], temp); | |||
c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
temp=mul_add(a1, b[7], temp); | |||
c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
temp=mul_add(a1, b7, temp); | |||
c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
temp=mul_add(a1, b6, temp); | |||
c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
temp=mul_add(a1, b5, temp); | |||
c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
temp=mul_add(a1, b4, temp); | |||
c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
temp=mul_add(a1, b3, temp); | |||
c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
temp=mul_add(a1, b2, temp); | |||
c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); | |||
c_avx[30] = mul_add(a1, b1, c_avx[30]); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} | |||
static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are not added cummulatively | |||
{ | |||
__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
__m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
c_avx[0] = _mm256_mullo_epi16 (a0, b0); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[1]=mul_add(a1, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
c_avx[2]= mul_add(a2, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
c_avx[3]= mul_add(a3, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
c_avx[4]= mul_add(a2, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
c_avx[5] = mul_add(a5, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
c_avx[6] = mul_add(a4, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
c_avx[7] = mul_add (a5, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
c_avx[8] = mul_add (a6, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
c_avx[9] = mul_add (a7, b2, temp); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
c_avx[10] = mul_add (a[8], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
c_avx[11] = mul_add (a[9], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
c_avx[12] = mul_add (a[10], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
c_avx[13] = mul_add (a[11], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
c_avx[14] = mul_add (a[12], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
c_avx[15] = mul_add (a[13], b2, temp ); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
c_avx[16] = mul_add (a1, b[1], temp ); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
c_avx[17] = mul_add (a1, b[2], temp ); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
c_avx[18] = mul_add (a1, b[3], temp ); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
c_avx[19] = mul_add (a1, b[4], temp ); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
c_avx[20] = mul_add (a1, b[5], temp ); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
c_avx[21] = mul_add (a1, b[6], temp ); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
c_avx[22] = mul_add (a1, b[7], temp ); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
c_avx[23] = mul_add (a1, b7, temp ); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
c_avx[24] = mul_add (a1, b6, temp ); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
c_avx[25] = mul_add (a1, b5, temp ); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
c_avx[26] = mul_add (a1, b4, temp ); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
c_avx[27] = mul_add (a1, b3, temp ); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
c_avx[28] = mul_add (a1, b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[29] = mul_add (a1, b0, temp); | |||
c_avx[30] = _mm256_mullo_epi16 (a1, b1); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} |
@@ -11,81 +11,102 @@ | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
// IND-CPA key generation: samples a seed for the matrix A and a secret vector s,
// computes a matrix-vector product with the transpose flag set to 1, rounds each
// coefficient with h1 and a right shift by (SABER_EQ - SABER_EP), then serialises
// s into sk (POLVECq2BS) and the rounded vector into pk (POLVECp2BS), with seed_A
// stored at pk + SABER_POLYVECCOMPRESSEDBYTES.
//
// NOTE(review): this span looks like diff residue -- removed and added lines of the
// hunk appear interleaved without +/- markers (A, s and seed_A are each declared
// twice, and both uint16_t-array and poly-based calls are present), so it is not
// compilable as written. TODO: confirm which lines belong to the intended revision.
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N] = {{0}}; | |||
uint8_t seed_A[SABER_SEEDBYTES]; | |||
uint8_t seed_s[SABER_NOISE_SEEDBYTES]; | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly s[SABER_L]; | |||
poly res[SABER_L]; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
// In this variant seed_A points directly into the public-key buffer, so no copy is needed later.
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
randombytes(seed_s, SABER_NOISE_SEEDBYTES); | |||
PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_FIRESABER_CLEAN_GenSecret(s, seed_s); | |||
PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); | |||
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_FIRESABER_CLEAN_GenSecret(s, rand); | |||
PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, s); | |||
PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // Matrix in transposed order | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); | |||
// In the array-based variant seed_A is a local buffer and is copied behind the packed vector here.
memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); | |||
PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t sp[SABER_L][SABER_N]; | |||
uint16_t bp[SABER_L][SABER_N] = {{0}}; | |||
uint16_t vp[SABER_N] = {0}; | |||
uint16_t mp[SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
poly s[SABER_L]; | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
PQCLEAN_FIRESABER_CLEAN_GenSecret(s, noiseseed); | |||
PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_FIRESABER_CLEAN_GenSecret(sp, seed_sp); | |||
PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); | |||
PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed | |||
for (i = 0; i < SABER_L; i++) { | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, res); | |||
PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, pk); | |||
PQCLEAN_FIRESABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(mp, m); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, pk); | |||
PQCLEAN_FIRESABER_CLEAN_InnerProd(vprime, temp, s); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(message, m); | |||
for (j = 0; j < SABER_N; j++) { | |||
vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_FIRESABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); | |||
PQCLEAN_FIRESABER_CLEAN_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
uint16_t v[SABER_N] = {0}; | |||
uint16_t cm[SABER_N]; | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
poly temp[SABER_L]; | |||
poly s[SABER_L]; | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext); | |||
PQCLEAN_FIRESABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_FIRESABER_CLEAN_InnerProd(&temp[0], temp, s); | |||
PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N; i++) { | |||
v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(m, v); | |||
@@ -5,7 +5,7 @@ | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); | |||
@@ -2,19 +2,21 @@ | |||
#define PARAMS_H | |||
/* Change this for different security strengths */ | |||
/* Don't change anything below this line */ | |||
#define SABER_L 4 | |||
#define SABER_MU 6 | |||
#define SABER_ET 6 | |||
#define SABER_EQ 13 | |||
#define SABER_EP 10 | |||
#define SABER_N 256 | |||
#define SABER_EP 10 | |||
#define SABER_P (1 << SABER_EP) | |||
#define SABER_EQ 13 | |||
#define SABER_Q (1 << SABER_EQ) | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISE_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_HASHBYTES 32 | |||
@@ -15,4 +15,4 @@ int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, | |||
int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); | |||
#endif /* api_h */ | |||
#endif /* PQCLEAN_FIRESABER_CLEAN_API_H */ |
@@ -1,136 +1,149 @@ | |||
#include "api.h" | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); | |||
out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); | |||
out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); | |||
in += 4; | |||
out += 3; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; | |||
data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2); | |||
data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4); | |||
data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); | |||
out[0] = in[0] & 0x3f; | |||
out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); | |||
out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); | |||
out[3] = ((in[2] & 0xff) >> 2); | |||
in += 3; | |||
out += 4; | |||
} | |||
} | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); | |||
bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); | |||
bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); | |||
bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); | |||
bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); | |||
bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); | |||
bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); | |||
bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); | |||
bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); | |||
bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); | |||
out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); | |||
out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); | |||
out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); | |||
out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); | |||
out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); | |||
out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); | |||
out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); | |||
in += 13; | |||
out += 8; | |||
} | |||
} | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); | |||
data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); | |||
data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); | |||
data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); | |||
out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); | |||
out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); | |||
out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); | |||
in += 5; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(data[i], bytes + i * SABER_POLYBYTES); | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); | |||
BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { | |||
void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { | |||
size_t i, j; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
data[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { | |||
void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} |
@@ -1,27 +1,28 @@ | |||
#ifndef PACK_UNPACK_H
#define PACK_UNPACK_H
#include "SABER_params.h"
#include "poly.h"
#include <stdint.h>
#include <stdio.h>

/* Polynomial <-> byte-string conversions. Each routine is declared once, with poly-typed arguments. */

void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);

void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);

void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);

void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);

void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);

void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);

void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);

void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);

#endif
@@ -3,32 +3,40 @@ | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "poly_mul.h" | |||
#include <stddef.h> | |||
void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { | |||
void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { | |||
size_t i, j; | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_L; j++) { | |||
if (transpose == 1) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); | |||
} else { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); | |||
if (transpose) { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); | |||
} | |||
} | |||
} else { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); | |||
} | |||
} | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { | |||
size_t j; | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res, b[j], s[j]); | |||
void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { | |||
size_t i; | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); | |||
for (i = 1; i < SABER_L; i++) { | |||
PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
@@ -37,13 +45,13 @@ void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], co | |||
} | |||
} | |||
void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_FIRESABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); | |||
PQCLEAN_FIRESABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); | |||
} | |||
} |
@@ -3,13 +3,21 @@ | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
} poly; | |||
void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); | |||
void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); | |||
void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); | |||
void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate); | |||
#endif |
@@ -1,4 +1,4 @@ | |||
#include "poly_mul.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t | |||
} | |||
/* res += a*b */ | |||
void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { | |||
uint16_t c[2 * SABER_N] = {0}; | |||
void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { | |||
uint16_t C[2 * SABER_N] = {0}; | |||
size_t i; | |||
toom_cook_4way(c, a, b); | |||
toom_cook_4way(C, a->coeffs, b->coeffs); | |||
/* reduction */ | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
res[i - SABER_N] += (c[i - SABER_N] - c[i]); | |||
if (accumulate == 0) { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); | |||
} | |||
} else { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); | |||
} | |||
} | |||
} |
@@ -1,9 +1,3 @@ | |||
#ifndef POLY_MUL_H | |||
#define POLY_MUL_H | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); | |||
#endif |
@@ -14,9 +14,9 @@ principal-submitters: | |||
- Frederik Vercauteren | |||
implementations: | |||
- name: clean | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
- name: avx2 | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,7 +2,7 @@ | |||
LIB=liblightsaber_avx2.a | |||
HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h | |||
OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o | |||
OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o | |||
CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -1,416 +1,125 @@ | |||
#include "./polymul/toom-cook_4way.c" | |||
#include "SABER_indcpa.h" | |||
#include "SABER_params.h" | |||
#include "api.h" | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "randombytes.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
//#include "randombytes.h" | |||
//#include "./polymul/toom_cook_4/toom-cook_4way.c" | |||
#define h1 4 //2^(EQ-EP-1) | |||
#define h1 (1 << (SABER_EQ - SABER_EP - 1)) | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly *skpv1 = A[0]; // use first row of A to hold sk temporarily | |||
toom4_points skpv1_eval[SABER_L]; | |||
poly res[SABER_L]; | |||
static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { | |||
int32_t i, j; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
message_dec[j] = 0; | |||
for (i = 0; i < 8; i++) { | |||
message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); | |||
} | |||
} | |||
} | |||
/*----------------------------------------------------------------------------------- | |||
This routine generates a=[Matrix K x K] of 256-coefficient polynomials | |||
static void GenMatrix(polyvec *a, const uint8_t *seed) { | |||
uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; | |||
uint16_t temp_ar[SABER_N]; | |||
int i, j, k; | |||
uint16_t mod = (SABER_Q - 1); | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); | |||
for (k = 0; k < SABER_N; k++) { | |||
a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; | |||
} | |||
} | |||
} | |||
} | |||
static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { | |||
uint32_t i; | |||
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; | |||
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_LIGHTSABER_AVX2_GenSecret(skpv1, rand); | |||
PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
PQCLEAN_LIGHTSABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); | |||
} | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { | |||
int64_t i, j; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
for (j = 0; j < NUM_POLY; j++) { | |||
PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order | |||
if (isTranspose == 0) { | |||
toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); | |||
} else { | |||
toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
TC_interpol(c_bucket, res_avx[i]); | |||
} | |||
} | |||
static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { | |||
int64_t i; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); | |||
} | |||
TC_interpol(c_bucket, res_avx); | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[SABER_K]; | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint8_t noiseseed[SABER_COINBYTES]; | |||
int32_t i, j, k; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
randombytes(seed, SABER_SEEDBYTES); | |||
shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state | |||
randombytes(noiseseed, SABER_COINBYTES); | |||
GenMatrix(a, seed); //sample matrix A | |||
GenSecret(skpv1, noiseseed); | |||
// Load sk into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
// Load a into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
} | |||
} | |||
//------------------------do the matrix vector multiplication and rounding------------ | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//------------------Pack sk into byte string------- | |||
PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); | |||
//------------------Pack pk into byte string------- | |||
for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. | |||
pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
toom4_points skpv1_eval[SABER_L]; | |||
uint32_t i, j, k; | |||
polyvec a[SABER_K]; // skpv; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint16_t temp[SABER_K][SABER_N]; | |||
uint16_t message[SABER_KEYBYTES * 8]; | |||
uint8_t msk_c[SABER_SCALEBYTES_KEM]; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod, mod_p; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i vprime_avx[SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i pkcl_avx[SABER_K][SABER_N / 16]; | |||
__m256i message_avx[SABER_N / 16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
mod_p = _mm256_set1_epi16(SABER_P - 1); | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. | |||
seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; | |||
PQCLEAN_LIGHTSABER_AVX2_GenSecret(temp, noiseseed); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); | |||
} | |||
GenMatrix(a, seed); | |||
GenSecret(skpv1, noiseseed); | |||
// ----------- Load skpv1 into avx vectors ---------- | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); | |||
PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed | |||
// ----------- Load skpv1 into avx vectors ---------- | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
//-----------------matrix-vector multiplication and rounding | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//-----this result should be put in b_prime for later use in server. | |||
for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string | |||
//**************client matrix-vector multiplication ends******************// | |||
//------now calculate the v' | |||
//-------unpack the public_key | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); | |||
} | |||
} | |||
// InnerProduct | |||
//for(k=0;k<SABER_N/16;k++){ | |||
// vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]); | |||
//} | |||
PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(ciphertext, res); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(temp, pk); | |||
PQCLEAN_LIGHTSABER_AVX2_InnerProd(vprime, temp, skpv1_eval); | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(message, m); | |||
vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); | |||
// Computation of v'+h1 | |||
for (i = 0; i < SABER_N / 16; i++) { //adding h1 | |||
vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1)); | |||
} | |||
// unpack m; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
message[8 * j + i] = ((m[j] >> i) & 0x01); | |||
} | |||
} | |||
// message encoding | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); | |||
message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); | |||
} | |||
// SHIFTRIGHT(v'+h1-m mod p, EP-ET) | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); | |||
vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); | |||
vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); | |||
} | |||
// Unpack avx | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(msk_c, temp[0]); | |||
for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { | |||
ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
uint32_t i, j; | |||
uint16_t sksv[SABER_K][SABER_N]; //secret key of the server | |||
uint16_t pksv[SABER_K][SABER_N]; | |||
uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; | |||
uint8_t scale_ar[SABER_SCALEBYTES_KEM]; | |||
uint16_t op[SABER_N]; | |||
//--------------AVX declaration------------------ | |||
//__m256i mod_p; | |||
__m256i v_avx[SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i sksv_avx[SABER_K][SABER_N / 16]; | |||
__m256i pksv_avx[SABER_K][SABER_N / 16]; | |||
poly temp[SABER_L]; | |||
toom4_points sksv_eval[SABER_L]; | |||
//mod_p=_mm256_set1_epi16(SABER_P-1); | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
//-------unpack the public_key | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); | |||
pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(temp, sk); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); | |||
} | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); | |||
} | |||
// InnerProduct(b', s, mod p) | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sksv_avx[j], b_bucket[j]); | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_LIGHTSABER_AVX2_InnerProd(v, temp, sksv_eval); | |||
vector_vector_mul(v_avx, pksv_avx, b_bucket); | |||
PQCLEAN_LIGHTSABER_AVX2_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
_mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); | |||
} | |||
for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { | |||
scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; | |||
} | |||
PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(op, scale_ar); | |||
//addition of h2 | |||
for (i = 0; i < SABER_N; i++) { | |||
message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
POL2MSG(m, message_dec_unpacked); | |||
PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(m, v); | |||
} |
@@ -1,46 +1,41 @@ | |||
#ifndef PARAMS_H | |||
#define PARAMS_H | |||
#include "api.h" | |||
#define SABER_K 2 | |||
/* Don't change anything below this line */ | |||
#define SABER_L 2 | |||
#define SABER_MU 10 | |||
#define SABER_ET 3 | |||
#define SABER_EQ 13 | |||
#define SABER_EP 10 | |||
#define SABER_N 256 | |||
#define SABER_Q 8192 //2^13 | |||
#define SABER_P 1024 | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_COINBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_EP 10 | |||
#define SABER_P (1 << SABER_EP) | |||
#define SABER_HASHBYTES 32 | |||
#define SABER_EQ 13 | |||
#define SABER_Q (1 << SABER_EQ) | |||
#define SABER_POLYBYTES 416 //13*256/8 | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_HASHBYTES 32 | |||
#define SABER_POLYVECBYTES (SABER_K * SABER_POLYBYTES) | |||
#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) | |||
#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation | |||
#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) | |||
#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) | |||
#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES) | |||
#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) | |||
#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) | |||
#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8) | |||
#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) | |||
#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES) | |||
#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES) | |||
#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES) | |||
#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) | |||
#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES) | |||
#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */ | |||
#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) | |||
#endif |
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
static uint64_t load_littleendian(const uint8_t *x, int bytes) { | |||
int i; | |||
uint64_t r = x[0]; | |||
for (i = 1; i < bytes; i++) { | |||
@@ -20,10 +20,7 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
return r; | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { | |||
uint16_t Qmod_minus1 = SABER_Q - 1; | |||
void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { | |||
uint64_t t, d, a[4], b[4]; | |||
int i, j; | |||
@@ -34,8 +31,8 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { | |||
d += (t >> j) & 0x0842108421UL; | |||
} | |||
a[0] = d & 0x1f; | |||
b[0] = (d >> 5) & 0x1f; | |||
a[0] = d & 0x1f; | |||
b[0] = (d >> 5) & 0x1f; | |||
a[1] = (d >> 10) & 0x1f; | |||
b[1] = (d >> 15) & 0x1f; | |||
a[2] = (d >> 20) & 0x1f; | |||
@@ -43,9 +40,9 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { | |||
a[3] = (d >> 30) & 0x1f; | |||
b[3] = (d >> 35); | |||
r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; | |||
r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; | |||
r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; | |||
r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; | |||
s[4 * i + 0] = (uint16_t)(a[0] - b[0]); | |||
s[4 * i + 1] = (uint16_t)(a[1] - b[1]); | |||
s[4 * i + 2] = (uint16_t)(a[2] - b[2]); | |||
s[4 * i + 3] = (uint16_t)(a[3] - b[3]); | |||
} | |||
} |
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
#include "poly.h" | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); | |||
void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); | |||
#endif |
@@ -4,14 +4,12 @@ | |||
#include "fips202.h" | |||
#include "randombytes.h" | |||
#include "verify.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk | |||
for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -39,7 +37,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t | |||
sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); | |||
// K^ <-- kr[0:31] | |||
// noiseseed (r) <-- kr[32:63]; | |||
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); | |||
@@ -49,7 +47,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t | |||
} | |||
int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
uint8_t fail; | |||
uint8_t cmp[SABER_BYTES_CCA_DEC]; | |||
uint8_t buf[64]; | |||
@@ -65,7 +63,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const u | |||
sha3_512(kr, buf, 64); | |||
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); | |||
PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); | |||
fail = PQCLEAN_LIGHTSABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); | |||
@@ -1,35 +1,3 @@ | |||
#ifndef INDCPA_H | |||
#define INDCPA_H | |||
#include <stdint.h> | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk); | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key); | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk); | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext); | |||
void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]); | |||
int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk); | |||
int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk); | |||
int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk); | |||
//uint64_t clock1,clock2; | |||
//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex; | |||
#endif |
@@ -1,502 +1,153 @@ | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); | |||
bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); | |||
out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); | |||
out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); | |||
out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); | |||
in += 8; | |||
out += 3; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; | |||
data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; | |||
data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); | |||
data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; | |||
data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; | |||
data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); | |||
data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); | |||
data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
data[offset_data] = bytes[j] & 0x0f; | |||
data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
out[0] = (in[0]) & 0x07; | |||
out[1] = ((in[0]) >> 3) & 0x07; | |||
out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); | |||
out[3] = ((in[1]) >> 1) & 0x07; | |||
out[4] = ((in[1]) >> 4) & 0x07; | |||
out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); | |||
out[6] = ((in[2] >> 2) & 0x07); | |||
out[7] = ((in[2] >> 5) & 0x07); | |||
in += 3; | |||
out += 8; | |||
} | |||
} | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); | |||
out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); | |||
out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); | |||
out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); | |||
out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); | |||
out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); | |||
out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); | |||
out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); | |||
in += 13; | |||
out += 8; | |||
} | |||
} | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; | |||
data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; | |||
data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; | |||
data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); | |||
out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); | |||
out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); | |||
out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); | |||
in += 5; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
/*
 * Deserialise a vector of SABER_L polynomials.
 *
 * The k-th SABER_POLYCOMPRESSEDBYTES-sized slot of `bytes` is passed to
 * BS2POLp, which fills data[k].
 */
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
    const uint8_t *slot = bytes;
    size_t k;

    for (k = 0; k < SABER_L; k++) {
        BS2POLp(data + k, slot);
        slot += SABER_POLYCOMPRESSEDBYTES;
    }
}
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
/*
 * Expand a SABER_KEYBYTES-byte message into one coefficient per bit.
 *
 * Coefficient 8*j + i receives bit i (counting from the least significant
 * bit) of bytes[j]; every written coefficient is therefore 0 or 1.
 */
void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) {
    size_t bit;

    for (bit = 0; bit < 8 * (size_t)SABER_KEYBYTES; bit++) {
        data->coeffs[bit] = (bytes[bit / 8] >> (bit % 8)) & 0x01;
    }
}
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
//for(i=0;i<SABER_K;i++){ | |||
//i=0; | |||
//offset_byte1=i*(SABER_N*13)/8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
//offset_byte=offset_byte1+13*j; | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
//} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
/*This function packs 11 bit data stream into 8 bits of data. | |||
*/ | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); | |||
data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); | |||
data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); | |||
data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); | |||
data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); | |||
} | |||
} | |||
} | |||
/*
 * Serialise a polynomial vector, choosing the packer by modulus:
 * 1024 dispatches to POLVECp2BS, 8192 to POLVECq2BS.
 * For any other modulus nothing is written.
 */
void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data);
        break;
    case 8192:
        PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(bytes, data);
        break;
    default:
        /* unsupported modulus: leave the output untouched */
        break;
    }
}
/*
 * Deserialise a polynomial vector, choosing the unpacker by modulus:
 * 1024 dispatches to BS2POLVECp, 8192 to BS2POLVECq.
 * For any other modulus `data` is left unchanged.
 */
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(data, bytes);
        break;
    case 8192:
        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(data, bytes);
        break;
    default:
        /* unsupported modulus: leave the output untouched */
        break;
    }
}
@@ -1,56 +1,28 @@ | |||
#ifndef PACK_UNPACK_H | |||
#define PACK_UNPACK_H | |||
#include "SABER_params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); | |||
#endif |
@@ -0,0 +1,62 @@ | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
/*
 * Multiply the SABER_L x SABER_L polynomial matrix A (or its transpose when
 * `transpose` is non-zero) by a vector given as toom4_points in s_eval.
 *
 * For each output row the SABER_L products are accumulated in one
 * toom4_points_product: the first call passes accumulate = 0, the remaining
 * calls pass accumulate = 1. A single toom4_interp call then writes c[row].
 */
void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) {
    toom4_points_product acc;
    size_t row, col;

    for (row = 0; row < SABER_L; row++) {
        for (col = 0; col < SABER_L; col++) {
            /* transposed access swaps the roles of the two indices */
            const poly *entry = transpose ? &A[col][row] : &A[row][col];

            PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&acc, entry, &s_eval[col], col != 0);
        }
        PQCLEAN_LIGHTSABER_AVX2_toom4_interp(&c[row], &acc);
    }
}
/*
 * Inner product of the polynomial vector b with the vector given as
 * toom4_points in s_eval.
 *
 * The SABER_L products are accumulated in one toom4_points_product (first
 * call with accumulate = 0, the rest with accumulate = 1) and turned into
 * the result polynomial `c` by a single toom4_interp call.
 */
void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) {
    toom4_points_product acc;
    size_t k = 0;

    PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&acc, b + k, s_eval + k, 0);
    for (k = 1; k < SABER_L; k++) {
        PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&acc, b + k, s_eval + k, 1);
    }

    PQCLEAN_LIGHTSABER_AVX2_toom4_interp(c, &acc);
}
/*
 * Derive the matrix A from a SABER_SEEDBYTES-byte seed.
 *
 * shake128 expands the seed into SABER_L * SABER_POLYVECBYTES bytes; row k
 * of A is decoded from the k-th SABER_POLYVECBYTES-sized chunk by
 * BS2POLVECq.
 */
void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
    uint8_t stream[SABER_L * SABER_POLYVECBYTES];
    const uint8_t *chunk = stream;
    size_t row;

    shake128(stream, sizeof(stream), seed, SABER_SEEDBYTES);

    for (row = 0; row < SABER_L; row++) {
        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(A[row], chunk);
        chunk += SABER_POLYVECBYTES;
    }
}
/*
 * Derive the secret vector s from a SABER_NOISESEEDBYTES-byte seed.
 *
 * shake128 expands the seed into SABER_L * SABER_POLYCOINBYTES bytes; the
 * coefficients of s[k] are produced by the cbd routine from the k-th
 * SABER_POLYCOINBYTES-sized chunk.
 */
void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
    uint8_t coins[SABER_L * SABER_POLYCOINBYTES];
    const uint8_t *chunk = coins;
    size_t k;

    shake128(coins, sizeof(coins), seed, SABER_NOISESEEDBYTES);

    for (k = 0; k < SABER_L; k++) {
        PQCLEAN_LIGHTSABER_AVX2_cbd(s[k].coeffs, chunk);
        chunk += SABER_POLYCOINBYTES;
    }
}
@@ -1,27 +1,38 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
/*--------------------------------------------------------------------- | |||
This file has been adapted from the implementation | |||
(available at, Public Domain https://github.com/pq-crystals/kyber) | |||
of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
#include "SABER_params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef struct { | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
__m256i dummy; | |||
} poly; | |||
typedef struct { | |||
poly vec[SABER_K]; | |||
} polyvec; | |||
typedef union { | |||
uint16_t coeffs[4 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points; | |||
void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); | |||
typedef union { | |||
uint16_t coeffs[8 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points_product; | |||
void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); | |||
void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); | |||
void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); | |||
void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); | |||
void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); | |||
#endif |
@@ -1,20 +0,0 @@ | |||
#include "../SABER_params.h" | |||
// SABER_N / 16.
// NOTE(review): presumably the number of __m256i vectors (16 x 16-bit lanes) per polynomial - confirm.
#define AVX_N (SABER_N >> 4) | |||
// AVX_N / 4, i.e. SABER_N / 64.
#define small_len_avx (AVX_N >> 2) | |||
#define SCHB_N 16 | |||
// SABER_N / 4.
#define N_SB (SABER_N >> 2) | |||
// 2 * N_SB - 1.
// NOTE(review): presumably the coefficient count of a product of two N_SB-coefficient polynomials - confirm.
#define N_SB_RES (2*N_SB-1) | |||
// N_SB / 4, i.e. SABER_N / 16.
#define N_SB_16 (N_SB >> 2) | |||
// 2 * N_SB_16 - 1.
#define N_SB_16_RES (2*N_SB_16-1) | |||
#define AVX_N1 16 /*N/16*/ | |||
#define SCM_SIZE 16 | |||
// The dimension of a vector, i.e. a vector has NUM_POLY elements and a matrix has NUM_POLY x NUM_POLY elements.
#define NUM_POLY SABER_K | |||
//int NUM_POLY=2; | |||
@@ -1,303 +0,0 @@ | |||
#include <immintrin.h> | |||
/*
 * Rearranges the sixteen 256-bit vectors M[0..15] in place.
 *
 * The shuffle network is: 16-bit interleave of row pairs, then 32-bit and
 * 64-bit interleaves, and finally 128-bit lane permutes (0x20 combines the
 * low lanes of both operands, 0x31 the high lanes) that write the results
 * back into M.
 * NOTE(review): per its name this is a 16x16 transpose of 16-bit elements;
 * confirm against the callers.
 *
 * Statement order matters: every M[k] is read (both its low and high
 * halves) before it is overwritten.
 */
static void transpose_n1(__m256i *M) | |||
{ | |||
register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; | |||
register __m256i temp, temp0, temp1, temp2; | |||
// 16-bit interleave of the low halves of each row pair (M[2k], M[2k+1]).
r0 = _mm256_unpacklo_epi16(M[0], M[1]); | |||
r1 = _mm256_unpacklo_epi16(M[2], M[3]); | |||
r2 = _mm256_unpacklo_epi16(M[4], M[5]); | |||
r3 = _mm256_unpacklo_epi16(M[6], M[7]); | |||
r4 = _mm256_unpacklo_epi16(M[8], M[9]); | |||
r5 = _mm256_unpacklo_epi16(M[10], M[11]); | |||
r6 = _mm256_unpacklo_epi16(M[12], M[13]); | |||
r7 = _mm256_unpacklo_epi16(M[14], M[15]); | |||
// 32-bit interleave: low parts go to temp*, high parts to r8..r11.
temp = _mm256_unpacklo_epi32(r0, r1); | |||
temp0 = _mm256_unpacklo_epi32(r2, r3); | |||
temp1 = _mm256_unpacklo_epi32(r4, r5); | |||
temp2 = _mm256_unpacklo_epi32(r6, r7); | |||
r8 = _mm256_unpackhi_epi32(r0, r1); | |||
r9 = _mm256_unpackhi_epi32(r2, r3); | |||
r10 = _mm256_unpackhi_epi32(r4, r5); | |||
r11 = _mm256_unpackhi_epi32(r6, r7); | |||
// 64-bit interleave of the low 32-bit results (inputs of M[0], M[1], M[8], M[9]).
r0 = _mm256_unpacklo_epi64(temp, temp0); | |||
r2 = _mm256_unpackhi_epi64(temp, temp0); | |||
r1 = _mm256_unpacklo_epi64(temp1, temp2); | |||
r3 = _mm256_unpackhi_epi64(temp1, temp2); | |||
// 16-bit interleave of the high halves of rows 0..9; temp* are reused as scratch.
temp = _mm256_unpackhi_epi16(M[0], M[1]); | |||
temp0 = _mm256_unpackhi_epi16(M[2], M[3]); | |||
temp1 = _mm256_unpackhi_epi16(M[4], M[5]); | |||
temp2 = _mm256_unpackhi_epi16(M[6], M[7]); | |||
r4 = _mm256_unpackhi_epi16(M[8], M[9]); | |||
// Rows 0..9 have now been read in both halves, so M[0], M[1], M[8], M[9]
// can be overwritten; M[10..15] are still read below.
M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
// High halves of rows 10..15 (must be read before M[10], M[11] are written).
r5 = _mm256_unpackhi_epi16(M[10], M[11]); | |||
r6 = _mm256_unpackhi_epi16(M[12], M[13]); | |||
r7 = _mm256_unpackhi_epi16(M[14], M[15]); | |||
// 64-bit interleave of the high 32-bit results saved in r8..r11.
r0 = _mm256_unpacklo_epi64(r8, r9); | |||
r1 = _mm256_unpacklo_epi64(r10, r11); | |||
r2 = _mm256_unpackhi_epi64(r8, r9); | |||
r3 = _mm256_unpackhi_epi64(r10, r11); | |||
M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
// Same 32-bit / 64-bit / lane-permute sequence for the high 16-bit halves:
// first the low 32-bit interleave ...
r0 = _mm256_unpacklo_epi32(temp, temp0); | |||
r1 = _mm256_unpacklo_epi32(temp1, temp2); | |||
r2 = _mm256_unpacklo_epi32(r4, r5); | |||
r3 = _mm256_unpacklo_epi32(r6, r7); | |||
r8 = _mm256_unpacklo_epi64(r0, r1); | |||
r10 = _mm256_unpackhi_epi64(r0, r1); | |||
r9 = _mm256_unpacklo_epi64(r2, r3); | |||
r11 = _mm256_unpackhi_epi64(r2, r3); | |||
M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); | |||
M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); | |||
M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); | |||
M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); | |||
// ... then the high 32-bit interleave.
r0 = _mm256_unpackhi_epi32(temp, temp0); | |||
r1 = _mm256_unpackhi_epi32(temp1, temp2); | |||
r2 = _mm256_unpackhi_epi32(r4, r5); | |||
r3 = _mm256_unpackhi_epi32(r6, r7); | |||
r4 = _mm256_unpacklo_epi64(r0, r1); | |||
r6 = _mm256_unpackhi_epi64(r0, r1); | |||
r5 = _mm256_unpacklo_epi64(r2, r3); | |||
r7 = _mm256_unpackhi_epi64(r2, r3); | |||
// Final lane permutes for the last four output rows.
M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); | |||
M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); | |||
M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); | |||
M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); | |||
} | |||
/* | |||
void transpose_unrolled(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
__m256i r0, r1, r2, r3, r4, r5, r6, r7; | |||
//for(i=0; i<8; i=i+1) | |||
//{ | |||
tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); | |||
tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); | |||
tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); | |||
tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); | |||
tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); | |||
tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); | |||
tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); | |||
tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); | |||
tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); | |||
tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); | |||
tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); | |||
tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); | |||
tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); | |||
tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); | |||
tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); | |||
tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); | |||
//} | |||
//------------------------------------------------------- | |||
//for(i=0; i<4; i=i+1) | |||
//{ | |||
bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); | |||
bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); | |||
bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); | |||
bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); | |||
bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); | |||
bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); | |||
bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); | |||
bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); | |||
dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); | |||
dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); | |||
dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); | |||
eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); | |||
eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); | |||
eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); | |||
//} | |||
//------------------------------------------------------- | |||
//------------------------------------------------------- | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
//------------------------------------------------------- | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
void transpose1(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
for(i=0; i<8; i=i+1) | |||
{ | |||
tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); | |||
tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); | |||
bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); | |||
dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); | |||
eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
*/ |
@@ -1,753 +0,0 @@ | |||
//#define SCM_SIZE 16 | |||
//#pragma STDC FP_CONTRACT ON | |||
#include <immintrin.h> | |||
/*
 * Lane-wise 16-bit multiply-accumulate.
 *
 * Returns a*b + c computed independently in every 16-bit lane: the product
 * keeps only the low 16 bits (_mm256_mullo_epi16) and the addition wraps
 * (_mm256_add_epi16), so each lane is evaluated modulo 2^16.
 */
static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
    const __m256i product = _mm256_mullo_epi16(a, b);

    return _mm256_add_epi16(product, c);
}
static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are added cummulatively | |||
{ | |||
register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
register __m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
//otherwise accumulate | |||
c_avx[0] = mul_add(a0, b0, c_avx[0]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
temp=mul_add(a2, b0, temp); | |||
c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
temp=mul_add(a3, b0, temp); | |||
c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
temp=mul_add(a2, b2, temp); | |||
c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
temp=mul_add(a5, b0, temp); | |||
c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
temp=mul_add(a4, b2, temp); | |||
c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
temp=mul_add(a5, b2, temp); | |||
c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
temp=mul_add(a6, b2, temp); | |||
c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
temp=mul_add(a7, b2, temp); | |||
c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
temp=mul_add(a[8], b2, temp); | |||
c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
temp=mul_add(a[9], b2, temp); | |||
c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
temp=mul_add(a[10], b2, temp); | |||
c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
temp=mul_add(a[11], b2, temp); | |||
c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
temp=mul_add(a[12], b2, temp); | |||
c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
temp=mul_add(a[13], b2, temp); | |||
c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
temp=mul_add(a1, b[1], temp); | |||
c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
temp=mul_add(a1, b[2], temp); | |||
c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
temp=mul_add(a1, b[3], temp); | |||
c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
temp=mul_add(a1, b[4], temp); | |||
c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
temp=mul_add(a1, b[5], temp); | |||
c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
temp=mul_add(a1, b[6], temp); | |||
c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
temp=mul_add(a1, b[7], temp); | |||
c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
temp=mul_add(a1, b7, temp); | |||
c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
temp=mul_add(a1, b6, temp); | |||
c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
temp=mul_add(a1, b5, temp); | |||
c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
temp=mul_add(a1, b4, temp); | |||
c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
temp=mul_add(a1, b3, temp); | |||
c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
temp=mul_add(a1, b2, temp); | |||
c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); | |||
c_avx[30] = mul_add(a1, b1, c_avx[30]); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} | |||
static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are not added cummulatively | |||
{ | |||
__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
__m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
c_avx[0] = _mm256_mullo_epi16 (a0, b0); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[1]=mul_add(a1, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
c_avx[2]= mul_add(a2, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
c_avx[3]= mul_add(a3, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
c_avx[4]= mul_add(a2, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
c_avx[5] = mul_add(a5, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
c_avx[6] = mul_add(a4, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
c_avx[7] = mul_add (a5, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
c_avx[8] = mul_add (a6, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
c_avx[9] = mul_add (a7, b2, temp); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
c_avx[10] = mul_add (a[8], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
c_avx[11] = mul_add (a[9], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
c_avx[12] = mul_add (a[10], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
c_avx[13] = mul_add (a[11], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
c_avx[14] = mul_add (a[12], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
c_avx[15] = mul_add (a[13], b2, temp ); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
c_avx[16] = mul_add (a1, b[1], temp ); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
c_avx[17] = mul_add (a1, b[2], temp ); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
c_avx[18] = mul_add (a1, b[3], temp ); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
c_avx[19] = mul_add (a1, b[4], temp ); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
c_avx[20] = mul_add (a1, b[5], temp ); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
c_avx[21] = mul_add (a1, b[6], temp ); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
c_avx[22] = mul_add (a1, b[7], temp ); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
c_avx[23] = mul_add (a1, b7, temp ); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
c_avx[24] = mul_add (a1, b6, temp ); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
c_avx[25] = mul_add (a1, b5, temp ); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
c_avx[26] = mul_add (a1, b4, temp ); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
c_avx[27] = mul_add (a1, b3, temp ); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
c_avx[28] = mul_add (a1, b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[29] = mul_add (a1, b0, temp); | |||
c_avx[30] = _mm256_mullo_epi16 (a1, b1); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} |
@@ -11,81 +11,102 @@ | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
/*
 * IND-CPA key-pair generation.
 *
 * pk : receives the output of POLVECp2BS (the rounded matrix-vector product),
 *      with the seed for matrix A located at pk + SABER_POLYVECCOMPRESSEDBYTES.
 * sk : receives the output of POLVECq2BS applied to the secret vector s.
 *
 * NOTE(review): this span appears to be a rendered diff whose +/- markers
 * were stripped, so two revisions of the body are interleaved: one using
 * uint16_t arrays (b, seed_s, stack-allocated seed_A) and one using `poly`
 * locals (res, rand, seed_A pointing into pk). A, s and seed_A are each
 * declared twice with different types, so the text does not compile as it
 * stands. Confirm which revision is intended before relying on any line.
 */
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N] = {{0}}; | |||
uint8_t seed_A[SABER_SEEDBYTES]; | |||
uint8_t seed_s[SABER_NOISE_SEEDBYTES]; | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly s[SABER_L]; | |||
poly res[SABER_L]; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
/* Draw a fresh seed, then replace it with its SHAKE-128 output. */
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
/* NOTE(review): the next four lines use seed_s / b (uint16_t-array revision). */
randombytes(seed_s, SABER_NOISE_SEEDBYTES); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, seed_s); | |||
PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); | |||
/* NOTE(review): the next five lines use rand / res (poly revision). */
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, rand); | |||
PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, s); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // Matrix in transposed order | |||
// rounding | |||
/* Add the constant h1, then shift each coefficient right by SABER_EQ - SABER_EP bits. */
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
/* NOTE(review): sk and pk are each packed twice across the two revisions. */
PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); | |||
memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); | |||
PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t sp[SABER_L][SABER_N]; | |||
uint16_t bp[SABER_L][SABER_N] = {{0}}; | |||
uint16_t vp[SABER_N] = {0}; | |||
uint16_t mp[SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
poly s[SABER_L]; | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, noiseseed); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_LIGHTSABER_CLEAN_GenSecret(sp, seed_sp); | |||
PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); | |||
PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed | |||
for (i = 0; i < SABER_L; i++) { | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, res); | |||
PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, pk); | |||
PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(mp, m); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, pk); | |||
PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vprime, temp, s); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(message, m); | |||
for (j = 0; j < SABER_N; j++) { | |||
vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); | |||
PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
uint16_t v[SABER_N] = {0}; | |||
uint16_t cm[SABER_N]; | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
poly temp[SABER_L]; | |||
poly s[SABER_L]; | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext); | |||
PQCLEAN_LIGHTSABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_LIGHTSABER_CLEAN_InnerProd(&temp[0], temp, s); | |||
PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N; i++) { | |||
v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(m, v); | |||
@@ -5,7 +5,7 @@ | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); | |||
@@ -2,19 +2,21 @@ | |||
#define PARAMS_H | |||
/* Change this for different security strengths */ | |||
/* Don't change anything below this line */ | |||
#define SABER_L 2 | |||
#define SABER_MU 10 | |||
#define SABER_ET 3 | |||
#define SABER_EQ 13 | |||
#define SABER_EP 10 | |||
#define SABER_N 256 | |||
#define SABER_EP 10 | |||
#define SABER_P (1 << SABER_EP) | |||
#define SABER_EQ 13 | |||
#define SABER_Q (1 << SABER_EQ) | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISE_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_HASHBYTES 32 | |||
@@ -15,4 +15,4 @@ int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, | |||
int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); | |||
#endif /* api_h */ | |||
#endif /* PQCLEAN_LIGHTSABER_CLEAN_API_H */ |
@@ -1,140 +1,153 @@ | |||
#include "api.h" | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ((data[offset_data + 1] & 0x7) << 3) | ((data[offset_data + 2] & 0x3) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2) & 0x01) | ((data[offset_data + 3] & 0x7) << 1) | ((data[offset_data + 4] & 0x7) << 4) | (((data[offset_data + 5]) & 0x01) << 7); | |||
bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1) & 0x03) | ((data[offset_data + 6] & 0x7) << 2) | ((data[offset_data + 7] & 0x7) << 5); | |||
out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); | |||
out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); | |||
out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5); | |||
in += 8; | |||
out += 3; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; | |||
data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3) & 0x07; | |||
data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6) & 0x03) | (((bytes[offset_byte + 1]) & 0x01) << 2); | |||
data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1) & 0x07; | |||
data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4) & 0x07; | |||
data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7) & 0x01) | (((bytes[offset_byte + 2]) & 0x03) << 1); | |||
data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07); | |||
data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07); | |||
out[0] = (in[0]) & 0x07; | |||
out[1] = ((in[0]) >> 3) & 0x07; | |||
out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); | |||
out[3] = ((in[1]) >> 1) & 0x07; | |||
out[4] = ((in[1]) >> 4) & 0x07; | |||
out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); | |||
out[6] = ((in[2] >> 2) & 0x07); | |||
out[7] = ((in[2] >> 5) & 0x07); | |||
in += 3; | |||
out += 8; | |||
} | |||
} | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); | |||
bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); | |||
bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); | |||
bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); | |||
bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); | |||
bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); | |||
bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); | |||
bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); | |||
bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); | |||
bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); | |||
out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); | |||
out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); | |||
out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); | |||
out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); | |||
out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); | |||
out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); | |||
out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); | |||
in += 13; | |||
out += 8; | |||
} | |||
} | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); | |||
data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); | |||
data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); | |||
data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); | |||
out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); | |||
out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); | |||
out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); | |||
in += 5; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(data[i], bytes + i * SABER_POLYBYTES); | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); | |||
BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { | |||
size_t i, j; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
data[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} |
@@ -1,27 +1,28 @@ | |||
#ifndef PACK_UNPACK_H | |||
#define PACK_UNPACK_H | |||
#include "SABER_params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); | |||
#endif |
@@ -3,32 +3,40 @@ | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "poly_mul.h" | |||
#include <stddef.h> | |||
void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { | |||
void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { | |||
size_t i, j; | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_L; j++) { | |||
if (transpose == 1) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); | |||
} else { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); | |||
if (transpose) { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); | |||
} | |||
} | |||
} else { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); | |||
} | |||
} | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { | |||
size_t j; | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res, b[j], s[j]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { | |||
size_t i; | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); | |||
for (i = 1; i < SABER_L; i++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
@@ -37,13 +45,13 @@ void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], c | |||
} | |||
} | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); | |||
PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); | |||
} | |||
} |
@@ -3,13 +3,21 @@ | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
} poly; | |||
void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate); | |||
#endif |
@@ -1,4 +1,4 @@ | |||
#include "poly_mul.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t | |||
} | |||
/* res += a*b */ | |||
void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { | |||
uint16_t c[2 * SABER_N] = {0}; | |||
void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { | |||
uint16_t C[2 * SABER_N] = {0}; | |||
size_t i; | |||
toom_cook_4way(c, a, b); | |||
toom_cook_4way(C, a->coeffs, b->coeffs); | |||
/* reduction */ | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
res[i - SABER_N] += (c[i - SABER_N] - c[i]); | |||
if (accumulate == 0) { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); | |||
} | |||
} else { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); | |||
} | |||
} | |||
} |
@@ -1,9 +1,3 @@ | |||
#ifndef POLY_MUL_H | |||
#define POLY_MUL_H | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); | |||
#endif |
@@ -14,9 +14,9 @@ principal-submitters: | |||
- Frederik Vercauteren | |||
implementations: | |||
- name: clean | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
- name: avx2 | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber | |||
version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber | |||
supported_platforms: | |||
- architecture: x86_64 | |||
operating_systems: | |||
@@ -2,7 +2,7 @@ | |||
LIB=libsaber_avx2.a | |||
HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h | |||
OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o | |||
OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o | |||
CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS) | |||
@@ -1,416 +1,125 @@ | |||
#include "./polymul/toom-cook_4way.c" | |||
#include "SABER_indcpa.h" | |||
#include "SABER_params.h" | |||
#include "api.h" | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "randombytes.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
//#include "randombytes.h" | |||
//#include "./polymul/toom_cook_4/toom-cook_4way.c" | |||
#define h1 4 //2^(EQ-EP-1) | |||
#define h1 (1 << (SABER_EQ - SABER_EP - 1)) | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) ) | |||
void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly *skpv1 = A[0]; // use first row of A to hold sk temporarily | |||
toom4_points skpv1_eval[SABER_L]; | |||
poly res[SABER_L]; | |||
static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) { | |||
int32_t i, j; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
message_dec[j] = 0; | |||
for (i = 0; i < 8; i++) { | |||
message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i); | |||
} | |||
} | |||
} | |||
/*----------------------------------------------------------------------------------- | |||
This routine generates a=[Matrix K x K] of 256-coefficient polynomials | |||
static void GenMatrix(polyvec *a, const uint8_t *seed) { | |||
uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8]; | |||
uint16_t temp_ar[SABER_N]; | |||
int i, j, k; | |||
uint16_t mod = (SABER_Q - 1); | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
PQCLEAN_SABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8); | |||
for (k = 0; k < SABER_N; k++) { | |||
a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ; | |||
} | |||
} | |||
} | |||
} | |||
static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) { | |||
uint32_t i; | |||
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
uint8_t buf[SABER_MU * SABER_N * SABER_K / 8]; | |||
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_SABER_AVX2_GenSecret(skpv1, rand); | |||
PQCLEAN_SABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_K; i++) { | |||
PQCLEAN_SABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]); | |||
} | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) { | |||
int64_t i, j; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
for (j = 0; j < NUM_POLY; j++) { | |||
PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order | |||
if (isTranspose == 0) { | |||
toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j); | |||
} else { | |||
toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
TC_interpol(c_bucket, res_avx[i]); | |||
} | |||
} | |||
static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) { | |||
int64_t i; | |||
__m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time | |||
for (i = 0; i < NUM_POLY; i++) { | |||
toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i); | |||
} | |||
TC_interpol(c_bucket, res_avx); | |||
} | |||
//********************************matrix-vector mul routines***************************************************** | |||
void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
polyvec a[SABER_K]; | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint8_t noiseseed[SABER_COINBYTES]; | |||
int32_t i, j, k; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
randombytes(seed, SABER_SEEDBYTES); | |||
shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state | |||
randombytes(noiseseed, SABER_COINBYTES); | |||
GenMatrix(a, seed); //sample matrix A | |||
GenSecret(skpv1, noiseseed); | |||
// Load sk into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
// Load a into avx vectors | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
} | |||
} | |||
//------------------------do the matrix vector multiplication and rounding------------ | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//------------------Pack sk into byte string------- | |||
PQCLEAN_SABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q); | |||
//------------------Pack pk into byte string------- | |||
for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_SABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format. | |||
pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i]; | |||
} | |||
PQCLEAN_SABER_AVX2_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
toom4_points skpv1_eval[SABER_L]; | |||
uint32_t i, j, k; | |||
polyvec a[SABER_K]; // skpv; | |||
uint8_t seed[SABER_SEEDBYTES]; | |||
uint16_t pkcl[SABER_K][SABER_N]; //public key of received by the client | |||
uint16_t skpv1[SABER_K][SABER_N]; | |||
uint16_t temp[SABER_K][SABER_N]; | |||
uint16_t message[SABER_KEYBYTES * 8]; | |||
uint8_t msk_c[SABER_SCALEBYTES_KEM]; | |||
//--------------AVX declaration------------------ | |||
__m256i sk_avx[SABER_K][SABER_N / 16]; | |||
__m256i mod, mod_p; | |||
__m256i res_avx[SABER_K][SABER_N / 16]; | |||
__m256i vprime_avx[SABER_N / 16]; | |||
__m256i a_avx[SABER_K][SABER_K][SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i pkcl_avx[SABER_K][SABER_N / 16]; | |||
__m256i message_avx[SABER_N / 16]; | |||
mod = _mm256_set1_epi16(SABER_Q - 1); | |||
mod_p = _mm256_set1_epi16(SABER_P - 1); | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK. | |||
seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i]; | |||
PQCLEAN_SABER_AVX2_GenSecret(temp, noiseseed); | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]); | |||
} | |||
GenMatrix(a, seed); | |||
GenSecret(skpv1, noiseseed); | |||
// ----------- Load skpv1 into avx vectors ---------- | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16])); | |||
} | |||
} | |||
PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A); | |||
PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed | |||
// ----------- Load skpv1 into avx vectors ---------- | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_K; j++) { | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16])); | |||
} | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
//-----------------matrix-vector multiplication and rounding | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sk_avx[j], b_bucket[j]); | |||
} | |||
matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order | |||
// Now truncation | |||
for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1)); | |||
res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) ); | |||
res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod); | |||
} | |||
} | |||
//-----this result should be put in b_prime for later use in server. | |||
for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); | |||
} | |||
} | |||
PQCLEAN_SABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string | |||
//**************client matrix-vector multiplication ends******************// | |||
//------now calculate the v' | |||
//-------unpack the public_key | |||
PQCLEAN_SABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P); | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16])); | |||
} | |||
} | |||
// InnerProduct | |||
//for(k=0;k<SABER_N/16;k++){ | |||
// vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]); | |||
//} | |||
PQCLEAN_SABER_AVX2_POLVECp2BS(ciphertext, res); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_SABER_AVX2_BS2POLVECp(temp, pk); | |||
PQCLEAN_SABER_AVX2_InnerProd(vprime, temp, skpv1_eval); | |||
PQCLEAN_SABER_AVX2_BS2POLmsg(message, m); | |||
vector_vector_mul(vprime_avx, pkcl_avx, b_bucket); | |||
// Computation of v'+h1 | |||
for (i = 0; i < SABER_N / 16; i++) { //adding h1 | |||
vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1)); | |||
} | |||
// unpack m; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
message[8 * j + i] = ((m[j] >> i) & 0x01); | |||
} | |||
} | |||
// message encoding | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16])); | |||
message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) ); | |||
} | |||
// SHIFTRIGHT(v'+h1-m mod p, EP-ET) | |||
for (k = 0; k < SABER_N / 16; k++) { | |||
vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]); | |||
vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p); | |||
vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) ); | |||
} | |||
// Unpack avx | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
_mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]); | |||
} | |||
PQCLEAN_SABER_AVX2_SABER_pack_4bit(msk_c, temp[0]); | |||
for (j = 0; j < SABER_SCALEBYTES_KEM; j++) { | |||
ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j]; | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_SABER_AVX2_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
uint32_t i, j; | |||
uint16_t sksv[SABER_K][SABER_N]; //secret key of the server | |||
uint16_t pksv[SABER_K][SABER_N]; | |||
uint16_t message_dec_unpacked[SABER_KEYBYTES * 8]; // one element containes on decrypted bit; | |||
uint8_t scale_ar[SABER_SCALEBYTES_KEM]; | |||
uint16_t op[SABER_N]; | |||
//--------------AVX declaration------------------ | |||
//__m256i mod_p; | |||
__m256i v_avx[SABER_N / 16]; | |||
//__m256i acc[2*SABER_N/16]; | |||
__m256i sksv_avx[SABER_K][SABER_N / 16]; | |||
__m256i pksv_avx[SABER_K][SABER_N / 16]; | |||
poly temp[SABER_L]; | |||
toom4_points sksv_eval[SABER_L]; | |||
//mod_p=_mm256_set1_epi16(SABER_P-1); | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
__m256i b_bucket[NUM_POLY][SCHB_N * 4]; | |||
//--------------AVX declaration ends------------------ | |||
//-------unpack the public_key | |||
PQCLEAN_SABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key | |||
PQCLEAN_SABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext | |||
for (i = 0; i < SABER_K; i++) { | |||
for (j = 0; j < SABER_N / 16; j++) { | |||
sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16])); | |||
pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16])); | |||
} | |||
PQCLEAN_SABER_AVX2_BS2POLVECq(temp, sk); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_SABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]); | |||
} | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]); | |||
} | |||
// InnerProduct(b', s, mod p) | |||
for (j = 0; j < NUM_POLY; j++) { | |||
TC_eval(sksv_avx[j], b_bucket[j]); | |||
} | |||
PQCLEAN_SABER_AVX2_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_SABER_AVX2_InnerProd(v, temp, sksv_eval); | |||
vector_vector_mul(v_avx, pksv_avx, b_bucket); | |||
PQCLEAN_SABER_AVX2_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N / 16; i++) { | |||
_mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]); | |||
} | |||
for (i = 0; i < SABER_SCALEBYTES_KEM; i++) { | |||
scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i]; | |||
} | |||
PQCLEAN_SABER_AVX2_SABER_un_pack4bit(op, scale_ar); | |||
//addition of h2 | |||
for (i = 0; i < SABER_N; i++) { | |||
message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
POL2MSG(m, message_dec_unpacked); | |||
PQCLEAN_SABER_AVX2_POLmsg2BS(m, v); | |||
} |
@@ -1,46 +1,41 @@ | |||
#ifndef PARAMS_H
#define PARAMS_H

#include "api.h"

/*
 * Parameter set and derived byte lengths.
 *
 * Every macro is defined exactly once. The source text defined several of
 * them twice with different replacement lists (for example SABER_Q as 8192
 * and as (1 << SABER_EQ)), which C does not allow even when the values
 * agree. The derived forms are kept; they evaluate to the same numbers as
 * the literals they replace (8192, 1024, 416, 3 * 320).
 */

/* Don't change anything below this line */
#define SABER_L 3
#define SABER_K SABER_L /* older name for the module rank, kept for existing users */
#define SABER_MU 8
#define SABER_ET 4

#define SABER_N 256

#define SABER_EP 10
#define SABER_P (1 << SABER_EP)

#define SABER_EQ 13
#define SABER_Q (1 << SABER_EQ)

#define SABER_SEEDBYTES 32
#define SABER_NOISESEEDBYTES 32
#define SABER_COINBYTES 32
#define SABER_KEYBYTES 32
#define SABER_HASHBYTES 32

#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8)

#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8)
#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)

#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8)
#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)

#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)

#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)

#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)

#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)

#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM)

#endif
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
static uint64_t load_littleendian(const uint8_t *x, int bytes) { | |||
int i; | |||
uint64_t r = x[0]; | |||
for (i = 1; i < bytes; i++) { | |||
@@ -20,32 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) { | |||
return r; | |||
} | |||
void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) { | |||
uint16_t Qmod_minus1 = SABER_Q - 1; | |||
void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { | |||
uint32_t t, d, a[4], b[4]; | |||
int i, j; | |||
for (i = 0; i < SABER_N / 4; i++) { | |||
t = load_littleendian(buf + 4 * i, 4); | |||
t = (uint32_t) load_littleendian(buf + 4 * i, 4); | |||
d = 0; | |||
for (j = 0; j < 4; j++) { | |||
d += (t >> j) & 0x11111111; | |||
} | |||
a[0] = d & 0xf; | |||
b[0] = (d >> 4) & 0xf; | |||
a[1] = (d >> 8) & 0xf; | |||
a[0] = d & 0xf; | |||
b[0] = (d >> 4) & 0xf; | |||
a[1] = (d >> 8) & 0xf; | |||
b[1] = (d >> 12) & 0xf; | |||
a[2] = (d >> 16) & 0xf; | |||
b[2] = (d >> 20) & 0xf; | |||
a[3] = (d >> 24) & 0xf; | |||
b[3] = (d >> 28); | |||
r[4 * i + 0] = (uint16_t)(a[0] - b[0]) & Qmod_minus1; | |||
r[4 * i + 1] = (uint16_t)(a[1] - b[1]) & Qmod_minus1; | |||
r[4 * i + 2] = (uint16_t)(a[2] - b[2]) & Qmod_minus1; | |||
r[4 * i + 3] = (uint16_t)(a[3] - b[3]) & Qmod_minus1; | |||
s[4 * i + 0] = (uint16_t)(a[0] - b[0]); | |||
s[4 * i + 1] = (uint16_t)(a[1] - b[1]); | |||
s[4 * i + 2] = (uint16_t)(a[2] - b[2]); | |||
s[4 * i + 3] = (uint16_t)(a[3] - b[3]); | |||
} | |||
} |
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle | |||
----------------------------------------------------------------------*/ | |||
#include "poly.h" | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf); | |||
void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]); | |||
#endif |
@@ -4,14 +4,12 @@ | |||
#include "fips202.h" | |||
#include "randombytes.h" | |||
#include "verify.h" | |||
#include <immintrin.h> | |||
#include <stddef.h> | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
PQCLEAN_SABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk | |||
for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) { | |||
@@ -39,7 +37,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) | |||
sha3_512(kr, buf, 64); // kr[0:63] <-- Hash(buf[0:63]); | |||
// K^ <-- kr[0:31] | |||
// noiseseed (r) <-- kr[32:63]; | |||
PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r; | |||
sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); | |||
@@ -49,7 +47,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) | |||
} | |||
int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { | |||
int i; | |||
size_t i; | |||
uint8_t fail; | |||
uint8_t cmp[SABER_BYTES_CCA_DEC]; | |||
uint8_t buf[64]; | |||
@@ -65,7 +63,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_ | |||
sha3_512(kr, buf, 64); | |||
PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk); | |||
PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk); | |||
fail = PQCLEAN_SABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC); | |||
@@ -1,35 +1,3 @@ | |||
#ifndef INDCPA_H
#define INDCPA_H

#include <stdint.h>

/* IND-CPA layer. */
void PQCLEAN_SABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
void PQCLEAN_SABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
void PQCLEAN_SABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);

void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk);

/* NOTE(review): the KEM code calls this as (ciphertext, message, noiseseed,
 * pk). All four parameters are uint8_t pointers, so a different order here
 * compiles silently; the definition is not visible from this header, so the
 * order is left as found - confirm against the definition. */
void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk, uint8_t *ciphertext);

/* Matches the definition: output message first, then secret key, then
 * ciphertext, with both inputs const. */
void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t *m, const uint8_t *sk, const uint8_t *ciphertext);

/* KEM layer; parameter types follow the uint8_t-based definitions. */
int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk);
int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk);
int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk);

#endif
@@ -1,502 +1,145 @@ | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01) | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7); | |||
bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 ); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07; | |||
data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07; | |||
data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 ); | |||
data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07; | |||
data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07; | |||
data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 ); | |||
data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 ); | |||
data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 ); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 ); | |||
out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); | |||
in += 2; | |||
out += 1; | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0; | |||
void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_data = 2 * j; | |||
data[offset_data] = bytes[j] & 0x0f; | |||
data[offset_data + 1] = (bytes[j] >> 4) & 0x0f; | |||
out[0] = in[0] & 0x0f; | |||
out[1] = (in[0] >> 4) & 0x0f; | |||
in += 1; | |||
out += 2; | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); | |||
out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); | |||
out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); | |||
out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); | |||
out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); | |||
out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); | |||
out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); | |||
out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); | |||
in += 13; | |||
out += 8; | |||
} | |||
} | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); | |||
bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 3 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f; | |||
data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2) ; | |||
data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ; | |||
data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 10) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 5 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[ offset_byte + 1 ] & 0x03) << 8); | |||
data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) | ((bytes[ offset_byte + 2 ] & 0x0f) << 6); | |||
data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) | ((bytes[ offset_byte + 3 ] & 0x3f) << 4); | |||
data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) | ((bytes[ offset_byte + 4 ] & 0xff) << 2); | |||
} | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); | |||
out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); | |||
out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); | |||
out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); | |||
in += 5; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 ); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 ); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 ); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 ); | |||
bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff ); | |||
} | |||
void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 13) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 13 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) { | |||
uint32_t j; | |||
uint32_t offset_data = 0, offset_byte = 0; | |||
//for(i=0;i<SABER_K;i++){ | |||
//i=0; | |||
//offset_byte1=i*(SABER_N*13)/8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
//offset_byte=offset_byte1+13*j; | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
} | |||
//} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
/*This function packs 11 bit data stream into 8 bits of data. | |||
*/ | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7); | |||
bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff ); | |||
bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2); | |||
bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5); | |||
bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff ); | |||
} | |||
void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 11) / 8; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = offset_byte1 + 11 * j; | |||
offset_data = 8 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 ); | |||
data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 ); | |||
data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 ); | |||
data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 ); | |||
data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 ); | |||
} | |||
/*
 * Decode a vector of SABER_L polynomials from a byte string.
 *
 * data:  output, one poly per vector entry.
 * bytes: input; entry k is decoded by BS2POLp starting at byte offset
 *        k * SABER_POLYCOMPRESSEDBYTES.
 */
void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
    for (size_t k = 0; k < SABER_L; k++) {
        poly *dst = &data[k];
        const uint8_t *src = bytes + k * SABER_POLYCOMPRESSEDBYTES;
        BS2POLp(dst, src);
    }
}
void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff)); | |||
bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6); | |||
bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff ); | |||
bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4); | |||
bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff ); | |||
bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2); | |||
bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff ); | |||
/*
 * Expand a SABER_KEYBYTES-byte message into a polynomial with one bit per
 * coefficient.
 *
 * Coefficient 8*j + i receives bit i (least significant bit first) of
 * bytes[j]; only the first 8 * SABER_KEYBYTES coefficients are written.
 */
void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) {
    const size_t nbits = (size_t)8 * SABER_KEYBYTES;

    for (size_t bit = 0; bit < nbits; bit++) {
        const size_t byte_idx = bit / 8;
        const size_t shift = bit % 8;
        data->coeffs[bit] = ((bytes[byte_idx] >> shift) & 0x01);
    }
}
void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) { | |||
uint32_t i, j; | |||
uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0; | |||
for (i = 0; i < SABER_K; i++) { | |||
offset_byte1 = i * (SABER_N * 14) / 8; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = offset_byte1 + 7 * j; | |||
offset_data = 4 * j; | |||
data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 ); | |||
data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 ); | |||
data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 ); | |||
data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 ); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} | |||
/*
 * Serialize a polynomial vector, picking the encoder from the modulus.
 *
 * modulus == 1024 dispatches to PQCLEAN_SABER_AVX2_POLVECp2BS,
 * modulus == 8192 dispatches to PQCLEAN_SABER_AVX2_POLVECq2BS.
 * For any other modulus nothing is written to bytes.
 */
void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_SABER_AVX2_POLVECp2BS(bytes, data);
        break;
    case 8192:
        PQCLEAN_SABER_AVX2_POLVECq2BS(bytes, data);
        break;
    default:
        /* unsupported modulus: leave the output untouched */
        break;
    }
}
/*
 * Deserialize a polynomial vector, picking the decoder from the modulus.
 *
 * modulus == 1024 dispatches to PQCLEAN_SABER_AVX2_BS2POLVECp,
 * modulus == 8192 dispatches to PQCLEAN_SABER_AVX2_BS2POLVECq.
 * For any other modulus data is left unmodified.
 */
void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
    switch (modulus) {
    case 1024:
        PQCLEAN_SABER_AVX2_BS2POLVECp(data, bytes);
        break;
    case 8192:
        PQCLEAN_SABER_AVX2_BS2POLVECq(data, bytes);
        break;
    default:
        /* unsupported modulus: leave the output untouched */
        break;
    }
}
@@ -1,56 +1,28 @@ | |||
#ifndef PACK_UNPACK_H | |||
#define PACK_UNPACK_H | |||
#include "SABER_params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); | |||
void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus); | |||
void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus); | |||
void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data); | |||
void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]); | |||
void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes); | |||
void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); | |||
#endif |
@@ -0,0 +1,62 @@ | |||
#include "cbd.h" | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
/*
 * Matrix-vector product c = A * s (or c = A^T * s when transpose is non-zero).
 *
 * c:         output vector of SABER_L polynomials.
 * A:         SABER_L x SABER_L matrix of polynomials.
 * s_eval:    the vector operand, supplied as toom4_points (one per entry).
 * transpose: non-zero selects A[col][row] instead of A[row][col].
 *
 * For every output entry the SABER_L partial products are gathered in a
 * single toom4_points_product: the first multiplication is issued with
 * accumulate == 0, the remaining ones with accumulate == 1, and
 * PQCLEAN_SABER_AVX2_toom4_interp is called once per output entry.
 */
void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) {
    toom4_points_product acc;

    for (size_t row = 0; row < SABER_L; row++) {
        for (size_t col = 0; col < SABER_L; col++) {
            const poly *entry = transpose ? &A[col][row] : &A[row][col];
            const int accumulate = (col != 0);
            PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&acc, entry, &s_eval[col], accumulate);
        }
        PQCLEAN_SABER_AVX2_toom4_interp(&c[row], &acc);
    }
}
/*
 * Inner product c = sum over k of b[k] * s[k].
 *
 * c:      output polynomial.
 * b:      vector of SABER_L polynomials.
 * s_eval: the second operand, supplied as toom4_points (one per entry).
 *
 * The first product is issued with accumulate == 0 and the rest with
 * accumulate == 1, so all terms land in one toom4_points_product that is
 * passed to PQCLEAN_SABER_AVX2_toom4_interp once at the end.
 */
void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) {
    toom4_points_product acc;

    for (size_t k = 0; k < SABER_L; k++) {
        const int accumulate = (k != 0);
        PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&acc, &b[k], &s_eval[k], accumulate);
    }
    PQCLEAN_SABER_AVX2_toom4_interp(c, &acc);
}
/*
 * Derive the matrix A from a seed.
 *
 * SHAKE-128 expands the SABER_SEEDBYTES-byte seed into
 * SABER_L * SABER_POLYVECBYTES bytes; row r of A is then decoded with
 * PQCLEAN_SABER_AVX2_BS2POLVECq from the r-th SABER_POLYVECBYTES-sized slice.
 */
void PQCLEAN_SABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
    uint8_t expanded[SABER_L * SABER_POLYVECBYTES];

    shake128(expanded, sizeof expanded, seed, SABER_SEEDBYTES);

    for (size_t row = 0; row < SABER_L; row++) {
        const uint8_t *row_bytes = &expanded[row * SABER_POLYVECBYTES];
        PQCLEAN_SABER_AVX2_BS2POLVECq(A[row], row_bytes);
    }
}
/*
 * Derive the secret vector s from a noise seed.
 *
 * SHAKE-128 expands the SABER_NOISESEEDBYTES-byte seed into
 * SABER_L * SABER_POLYCOINBYTES bytes; entry k of s is produced by
 * PQCLEAN_SABER_AVX2_cbd from the k-th SABER_POLYCOINBYTES-sized slice.
 */
void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
    uint8_t coins[SABER_L * SABER_POLYCOINBYTES];

    shake128(coins, sizeof coins, seed, SABER_NOISESEEDBYTES);

    for (size_t k = 0; k < SABER_L; k++) {
        const uint8_t *poly_coins = &coins[k * SABER_POLYCOINBYTES];
        PQCLEAN_SABER_AVX2_cbd(s[k].coeffs, poly_coins);
    }
}
@@ -1,27 +1,38 @@ | |||
#ifndef POLY_H | |||
#define POLY_H | |||
/*--------------------------------------------------------------------- | |||
This file has been adapted from the implementation | |||
(available at, Public Domain https://github.com/pq-crystals/kyber) | |||
of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM" | |||
by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint, | |||
Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehlé | |||
#include "SABER_params.h" | |||
#include <immintrin.h> | |||
#include <stdint.h> | |||
typedef struct { | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
__m256i dummy; | |||
} poly; | |||
typedef struct { | |||
poly vec[SABER_K]; | |||
} polyvec; | |||
typedef union { | |||
uint16_t coeffs[4 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points; | |||
void PQCLEAN_SABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce); | |||
typedef union { | |||
uint16_t coeffs[8 * SABER_N]; | |||
__m256i dummy; | |||
} toom4_points_product; | |||
void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose); | |||
void PQCLEAN_SABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3); | |||
void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]); | |||
void PQCLEAN_SABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_SABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval); | |||
void PQCLEAN_SABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b); | |||
void PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate); | |||
#endif |
@@ -1,20 +0,0 @@ | |||
#include "../SABER_params.h" | |||
#define AVX_N (SABER_N >> 4) | |||
#define small_len_avx (AVX_N >> 2) | |||
#define SCHB_N 16 | |||
#define N_SB (SABER_N >> 2) | |||
#define N_SB_RES (2*N_SB-1) | |||
#define N_SB_16 (N_SB >> 2) | |||
#define N_SB_16_RES (2*N_SB_16-1) | |||
#define AVX_N1 16 /*N/16*/ | |||
#define SCM_SIZE 16 | |||
// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements | |||
#define NUM_POLY SABER_K | |||
//int NUM_POLY=2; |
@@ -1,303 +0,0 @@ | |||
#include <immintrin.h> | |||
/*
 * In-place shuffle of the 16 vectors M[0..15] (16 x 16-bit lanes each).
 *
 * The routine interleaves neighbouring vectors at 16-bit, then 32-bit, then
 * 64-bit granularity and finally recombines 128-bit halves with
 * _mm256_permute2f128_si256 (0x20 = low halves of both operands,
 * 0x31 = high halves of both operands).
 * NOTE(review): going by the name and the unpack/permute pattern this is a
 * 16x16 transpose of 16-bit elements -- confirm against the callers.
 *
 * M: array of at least 16 __m256i; every element M[0]..M[15] is read and
 *    overwritten.
 *
 * The statement order matters: inputs are read from M before the slots they
 * occupy are overwritten with results.
 */
static void transpose_n1(__m256i *M) | |||
{ | |||
//int i; | |||
register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; | |||
register __m256i temp, temp0, temp1, temp2; | |||
//for(i=0; i<8; i=i+1) | |||
//{ | |||
// Stage 1 (low part): interleave the low 16-bit lanes of each vector pair (M[2k], M[2k+1]).
r0 = _mm256_unpacklo_epi16(M[0], M[1]); | |||
r1 = _mm256_unpacklo_epi16(M[2], M[3]); | |||
r2 = _mm256_unpacklo_epi16(M[4], M[5]); | |||
r3 = _mm256_unpacklo_epi16(M[6], M[7]); | |||
r4 = _mm256_unpacklo_epi16(M[8], M[9]); | |||
r5 = _mm256_unpacklo_epi16(M[10], M[11]); | |||
r6 = _mm256_unpacklo_epi16(M[12], M[13]); | |||
r7 = _mm256_unpacklo_epi16(M[14], M[15]); | |||
// Stage 2: interleave at 32-bit granularity; low results go to temp*, high results to r8..r11.
temp = _mm256_unpacklo_epi32(r0, r1); | |||
temp0 = _mm256_unpacklo_epi32(r2, r3); | |||
temp1 = _mm256_unpacklo_epi32(r4, r5); | |||
temp2 = _mm256_unpacklo_epi32(r6, r7); | |||
r8 = _mm256_unpackhi_epi32(r0, r1); | |||
r9 = _mm256_unpackhi_epi32(r2, r3); | |||
r10 = _mm256_unpackhi_epi32(r4, r5); | |||
r11 = _mm256_unpackhi_epi32(r6, r7); | |||
// Stage 3: interleave at 64-bit granularity.
r0 = _mm256_unpacklo_epi64(temp, temp0); | |||
r2 = _mm256_unpackhi_epi64(temp, temp0); | |||
r1 = _mm256_unpacklo_epi64(temp1, temp2); | |||
r3 = _mm256_unpackhi_epi64(temp1, temp2); | |||
// Stage 1 (high part) for M[0..9]: these reads must happen before M[0], M[1], M[8], M[9]
// are overwritten just below. temp* are reused as scratch here.
temp = _mm256_unpackhi_epi16(M[0], M[1]); | |||
temp0 = _mm256_unpackhi_epi16(M[2], M[3]); | |||
temp1 = _mm256_unpackhi_epi16(M[4], M[5]); | |||
temp2 = _mm256_unpackhi_epi16(M[6], M[7]); | |||
r4 = _mm256_unpackhi_epi16(M[8], M[9]); | |||
// Stage 4: combine 128-bit halves and store outputs 0, 8, 1, 9.
M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
M[1] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[9] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
// Stage 1 (high part) for M[10..15]: still the original inputs, M[10] and M[11] are only written further down.
r5 = _mm256_unpackhi_epi16(M[10], M[11]); | |||
r6 = _mm256_unpackhi_epi16(M[12], M[13]); | |||
r7 = _mm256_unpackhi_epi16(M[14], M[15]); | |||
// Stages 3 and 4 for the high 32-bit results saved in r8..r11: outputs 2, 3, 10, 11.
r0 = _mm256_unpacklo_epi64(r8, r9); | |||
r1 = _mm256_unpacklo_epi64(r10, r11); | |||
r2 = _mm256_unpackhi_epi64(r8, r9); | |||
r3 = _mm256_unpackhi_epi64(r10, r11); | |||
M[3] = _mm256_permute2f128_si256(r2, r3, 0x20); | |||
M[11] = _mm256_permute2f128_si256(r2, r3, 0x31); | |||
M[2] = _mm256_permute2f128_si256(r0, r1, 0x20); | |||
M[10] = _mm256_permute2f128_si256(r0, r1, 0x31); | |||
//for(i=0; i<4; i=i+1) | |||
//{ | |||
// Stage 2 (low 32-bit interleave) on the high 16-bit unpack results.
r0 = _mm256_unpacklo_epi32(temp, temp0); | |||
r1 = _mm256_unpacklo_epi32(temp1, temp2); | |||
r2 = _mm256_unpacklo_epi32(r4, r5); | |||
r3 = _mm256_unpacklo_epi32(r6, r7); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
// Stages 3 and 4: outputs 4, 12, 5, 13.
r8 = _mm256_unpacklo_epi64(r0, r1); | |||
r10 = _mm256_unpackhi_epi64(r0, r1); | |||
r9 = _mm256_unpacklo_epi64(r2, r3); | |||
r11 = _mm256_unpackhi_epi64(r2, r3); | |||
M[4] = _mm256_permute2f128_si256(r8, r9, 0x20); | |||
M[12] = _mm256_permute2f128_si256(r8, r9, 0x31); | |||
M[5] = _mm256_permute2f128_si256(r10, r11, 0x20); | |||
M[13] = _mm256_permute2f128_si256(r10, r11, 0x31); | |||
// Stage 2 (high 32-bit interleave) on the high 16-bit unpack results;
// temp*, r4..r7 still hold those results at this point.
r0 = _mm256_unpackhi_epi32(temp, temp0); | |||
r1 = _mm256_unpackhi_epi32(temp1, temp2); | |||
r2 = _mm256_unpackhi_epi32(r4, r5); | |||
r3 = _mm256_unpackhi_epi32(r6, r7); | |||
//} | |||
// for(i=0; i<2; i=i+1) | |||
// { | |||
// Stage 3 for the last four outputs (r4..r7 are free to be overwritten now).
r4 = _mm256_unpacklo_epi64(r0, r1); | |||
r6 = _mm256_unpackhi_epi64(r0, r1); | |||
r5 = _mm256_unpacklo_epi64(r2, r3); | |||
r7 = _mm256_unpackhi_epi64(r2, r3); | |||
// } | |||
//------------------------------------------------------- | |||
// Stage 4: outputs 6, 14, 7, 15.
M[6] = _mm256_permute2f128_si256(r4, r5, 0x20); | |||
M[14] = _mm256_permute2f128_si256(r4, r5, 0x31); | |||
M[7] = _mm256_permute2f128_si256(r6, r7, 0x20); | |||
M[15] = _mm256_permute2f128_si256(r6, r7, 0x31); | |||
} | |||
/* | |||
void transpose_unrolled(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
__m256i r0, r1, r2, r3, r4, r5, r6, r7; | |||
//for(i=0; i<8; i=i+1) | |||
//{ | |||
tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); | |||
tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); | |||
tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); | |||
tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); | |||
tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); | |||
tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); | |||
tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); | |||
tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); | |||
tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); | |||
tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); | |||
tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); | |||
tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); | |||
tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); | |||
tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); | |||
tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); | |||
tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); | |||
//} | |||
//------------------------------------------------------- | |||
//for(i=0; i<4; i=i+1) | |||
//{ | |||
bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); | |||
bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); | |||
bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); | |||
bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); | |||
bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); | |||
bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); | |||
bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); | |||
bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); | |||
dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); | |||
dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); | |||
dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]); | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
//} | |||
//for(i=0; i<2; i=i+1) | |||
//{ | |||
eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); | |||
eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); | |||
eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); | |||
eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); | |||
//} | |||
//------------------------------------------------------- | |||
//------------------------------------------------------- | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
//------------------------------------------------------- | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
void transpose1(__m256i *M) | |||
{ | |||
int i; | |||
__m256i tL[8], tH[8]; | |||
__m256i bL[4], bH[4], cL[4], cH[4]; | |||
__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2]; | |||
for(i=0; i<8; i=i+1) | |||
{ | |||
tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); | |||
tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); | |||
bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); | |||
} | |||
for(i=0; i<4; i=i+1) | |||
{ | |||
cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); | |||
cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); | |||
dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); | |||
eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); | |||
fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); | |||
} | |||
for(i=0; i<2; i=i+1) | |||
{ | |||
gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); | |||
gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); | |||
} | |||
M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20); | |||
M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31); | |||
M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20); | |||
M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31); | |||
M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20); | |||
M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31); | |||
M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20); | |||
M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31); | |||
M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20); | |||
M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31); | |||
M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20); | |||
M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31); | |||
M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20); | |||
M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31); | |||
M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20); | |||
M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31); | |||
} | |||
*/ |
@@ -1,753 +0,0 @@ | |||
//#define SCM_SIZE 16 | |||
//#pragma STDC FP_CONTRACT ON | |||
#include <immintrin.h> | |||
/*
 * Lane-wise multiply-accumulate on sixteen 16-bit lanes: returns a * b + c.
 * _mm256_mullo_epi16 keeps only the low 16 bits of each product and
 * _mm256_add_epi16 wraps on overflow, so each lane is computed modulo 2^16.
 */
static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { | |||
return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); | |||
} | |||
static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are added cummulatively | |||
{ | |||
register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
register __m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
//otherwise accumulate | |||
c_avx[0] = mul_add(a0, b0, c_avx[0]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[1] = _mm256_add_epi16(temp, c_avx[1]); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
temp=mul_add(a2, b0, temp); | |||
c_avx[2] = _mm256_add_epi16(temp, c_avx[2]); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
temp=mul_add(a3, b0, temp); | |||
c_avx[3] = _mm256_add_epi16(temp, c_avx[3]); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
temp=mul_add(a2, b2, temp); | |||
c_avx[4] = _mm256_add_epi16(temp, c_avx[4]); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
temp=mul_add(a5, b0, temp); | |||
c_avx[5] = _mm256_add_epi16(temp, c_avx[5]); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
temp=mul_add(a4, b2, temp); | |||
c_avx[6] = _mm256_add_epi16(temp, c_avx[6]); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
temp=mul_add(a5, b2, temp); | |||
c_avx[7] = _mm256_add_epi16(temp, c_avx[7]); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
temp=mul_add(a6, b2, temp); | |||
c_avx[8] = _mm256_add_epi16(temp, c_avx[8]); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
temp=mul_add(a7, b2, temp); | |||
c_avx[9] = _mm256_add_epi16(temp, c_avx[9]); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
temp=mul_add(a[8], b2, temp); | |||
c_avx[10] = _mm256_add_epi16(temp, c_avx[10]); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
temp=mul_add(a[9], b2, temp); | |||
c_avx[11] = _mm256_add_epi16(temp, c_avx[11]); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
temp=mul_add(a[10], b2, temp); | |||
c_avx[12] = _mm256_add_epi16(temp, c_avx[12]); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
temp=mul_add(a[11], b2, temp); | |||
c_avx[13] = _mm256_add_epi16(temp, c_avx[13]); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
temp=mul_add(a[12], b2, temp); | |||
c_avx[14] = _mm256_add_epi16(temp, c_avx[14]); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
temp=mul_add(a[13], b2, temp); | |||
c_avx[15] = _mm256_add_epi16(temp, c_avx[15]); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
temp=mul_add(a1, b[1], temp); | |||
c_avx[16] = _mm256_add_epi16(temp, c_avx[16]); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
temp=mul_add(a1, b[2], temp); | |||
c_avx[17] = _mm256_add_epi16(temp, c_avx[17]); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
temp=mul_add(a1, b[3], temp); | |||
c_avx[18] = _mm256_add_epi16(temp, c_avx[18]); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
temp=mul_add(a1, b[4], temp); | |||
c_avx[19] = _mm256_add_epi16(temp, c_avx[19]); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
temp=mul_add(a1, b[5], temp); | |||
c_avx[20] = _mm256_add_epi16(temp, c_avx[20]); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
temp=mul_add(a1, b[6], temp); | |||
c_avx[21] = _mm256_add_epi16(temp, c_avx[21]); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
temp=mul_add(a1, b[7], temp); | |||
c_avx[22] = _mm256_add_epi16(temp, c_avx[22]); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
temp=mul_add(a1, b7, temp); | |||
c_avx[23] = _mm256_add_epi16(temp, c_avx[23]); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
temp=mul_add(a1, b6, temp); | |||
c_avx[24] = _mm256_add_epi16(temp, c_avx[24]); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
temp=mul_add(a1, b5, temp); | |||
c_avx[25] = _mm256_add_epi16(temp, c_avx[25]); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
temp=mul_add(a1, b4, temp); | |||
c_avx[26] = _mm256_add_epi16(temp, c_avx[26]); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
temp=mul_add(a1, b3, temp); | |||
c_avx[27] = _mm256_add_epi16(temp, c_avx[27]); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
temp=mul_add(a1, b2, temp); | |||
c_avx[28] = _mm256_add_epi16(temp, c_avx[28]); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
temp=mul_add(a1, b0, temp); | |||
c_avx[29] = _mm256_add_epi16(temp, c_avx[29]); | |||
c_avx[30] = mul_add(a1, b1, c_avx[30]); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} | |||
static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched | |||
//the c_avx are not added cummulatively | |||
{ | |||
__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; | |||
__m256i temp; | |||
a0=a[0]; | |||
a1=a[1]; | |||
a2=a[2]; | |||
a3=a[3]; | |||
a4=a[4]; | |||
a5=a[5]; | |||
a6=a[6]; | |||
a7=a[7]; | |||
b0=b[0]; | |||
b1=b[1]; | |||
b2=b[2]; | |||
b3=b[3]; | |||
b4=b[4]; | |||
b5=b[5]; | |||
b6=b[6]; | |||
b7=b[7]; | |||
// New Unrolled first triangle | |||
c_avx[0] = _mm256_mullo_epi16 (a0, b0); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[1]=mul_add(a1, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b2); | |||
temp = mul_add(a1, b1, temp); | |||
c_avx[2]= mul_add(a2, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b3); | |||
temp = mul_add(a1, b2, temp); | |||
temp = mul_add(a2, b1, temp); | |||
c_avx[3]= mul_add(a3, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b4); | |||
temp = mul_add(a1, b3, temp); | |||
temp = mul_add(a3, b1, temp); | |||
temp = mul_add(a4, b0, temp); | |||
c_avx[4]= mul_add(a2, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b5); | |||
temp = mul_add(a1, b4 , temp); | |||
temp = mul_add(a2, b3, temp); | |||
temp = mul_add(a3, b2, temp); | |||
temp = mul_add( a4, b1, temp); | |||
c_avx[5] = mul_add(a5, b0, temp); | |||
temp = _mm256_mullo_epi16 (a0, b6); | |||
temp = mul_add(a1, b5, temp); | |||
temp = mul_add(a5, b1, temp); | |||
temp = mul_add(a6, b0, temp); | |||
temp = mul_add(a2, b4, temp); | |||
temp = mul_add(a3, b3, temp); | |||
c_avx[6] = mul_add(a4, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b7); | |||
temp = mul_add(a1, b6, temp); | |||
temp = mul_add (a6, b1, temp); | |||
temp = mul_add (a7, b0, temp); | |||
temp = mul_add(a2, b5, temp); | |||
temp = mul_add (a3, b4, temp); | |||
temp = mul_add (a4, b3, temp); | |||
c_avx[7] = mul_add (a5, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[8]); | |||
temp = mul_add (a1, b7, temp); | |||
temp = mul_add (a7, b1, temp); | |||
temp = mul_add (a[8], b0, temp); | |||
temp = mul_add (a2, b6,temp); | |||
temp = mul_add(a3, b5, temp); | |||
temp = mul_add (a4, b4,temp); | |||
temp = mul_add (a5, b3, temp); | |||
c_avx[8] = mul_add (a6, b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[9]); | |||
temp = mul_add (a1, b[8], temp); | |||
temp = mul_add (a[8], b1, temp); | |||
temp = mul_add (a[9], b0, temp); | |||
temp = mul_add (a2, b7, temp); | |||
temp = mul_add (a3, b6, temp); | |||
temp = mul_add (a4, b5, temp); | |||
temp = mul_add (a5, b4, temp); | |||
temp = mul_add (a6, b3, temp); | |||
c_avx[9] = mul_add (a7, b2, temp); | |||
temp= _mm256_mullo_epi16 (a0, b[10]); | |||
temp = mul_add (a1, b[9], temp); | |||
temp = mul_add (a[9], b1, temp); | |||
temp = mul_add (a[10], b0, temp); | |||
temp = mul_add (a2, b[8], temp); | |||
temp = mul_add (a3, b7, temp); | |||
temp = mul_add (a4, b6, temp); | |||
temp = mul_add (a5, b5, temp); | |||
temp = mul_add (a6, b4, temp); | |||
temp = mul_add (a7, b3, temp); | |||
c_avx[10] = mul_add (a[8], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[11]); | |||
temp = mul_add (a1, b[10], temp ); | |||
temp = mul_add (a[10], b1, temp ); | |||
temp = mul_add (a[11], b0, temp ); | |||
temp = mul_add (a2, b[9], temp ); | |||
temp = mul_add (a3, b[8], temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a[8], b3, temp ); | |||
c_avx[11] = mul_add (a[9], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[12]); | |||
temp = mul_add (a1, b[11], temp); | |||
temp = mul_add (a[11], b1, temp); | |||
temp = mul_add (a[12], b0, temp); | |||
temp = mul_add (a2, b[10], temp); | |||
temp = mul_add (a3, b[9], temp); | |||
temp = mul_add (a4, b[8], temp); | |||
temp = mul_add (a5, b7, temp); | |||
temp = mul_add (a6, b6, temp); | |||
temp = mul_add (a7, b5, temp); | |||
temp = mul_add (a[8], b4, temp); | |||
temp = mul_add (a[9], b3, temp); | |||
c_avx[12] = mul_add (a[10], b2, temp); | |||
temp = _mm256_mullo_epi16 (a0, b[13]); | |||
temp = mul_add (a1, b[12], temp ); | |||
temp = mul_add (a[12], b1, temp ); | |||
temp = mul_add (a[13], b0, temp ); | |||
temp = mul_add (a2, b[11], temp ); | |||
temp = mul_add (a3, b[10], temp ); | |||
temp = mul_add (a4, b[9], temp ); | |||
temp = mul_add (a5, b[8], temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a[8], b5, temp ); | |||
temp = mul_add (a[9], b4, temp ); | |||
temp = mul_add (a[10], b3, temp ); | |||
c_avx[13] = mul_add (a[11], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[14]); | |||
temp = mul_add (a1, b[13], temp ); | |||
temp = mul_add (a[13], b1, temp ); | |||
temp = mul_add (a[14], b0, temp ); | |||
temp = mul_add (a2, b[12], temp ); | |||
temp = mul_add (a3, b[11], temp ); | |||
temp = mul_add (a4, b[10], temp ); | |||
temp = mul_add (a5, b[9], temp ); | |||
temp = mul_add (a6, b[8], temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a[8], b6, temp ); | |||
temp = mul_add (a[9], b5, temp ); | |||
temp = mul_add (a[10], b4, temp ); | |||
temp = mul_add (a[11], b3, temp ); | |||
c_avx[14] = mul_add (a[12], b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b[15]); | |||
temp = mul_add (a1, b[14], temp ); | |||
temp = mul_add (a[14], b1, temp ); | |||
temp = mul_add (a[15], b0, temp ); | |||
temp = mul_add (a2, b[13], temp ); | |||
temp = mul_add (a3, b[12], temp ); | |||
temp = mul_add (a4, b[11], temp ); | |||
temp = mul_add (a5, b[10], temp ); | |||
temp = mul_add (a6, b[9], temp ); | |||
temp = mul_add (a7, b[8], temp ); | |||
temp = mul_add (a[8], b7, temp ); | |||
temp = mul_add (a[9], b6, temp ); | |||
temp = mul_add (a[10], b5, temp ); | |||
temp = mul_add (a[11], b4, temp ); | |||
temp = mul_add (a[12], b3, temp ); | |||
c_avx[15] = mul_add (a[13], b2, temp ); | |||
// unrolled second triangle | |||
a0=a[14]; | |||
a1=a[15]; | |||
a2=a[13]; | |||
a3=a[12]; | |||
a4=a[11]; | |||
a5=a[10]; | |||
a6=a[9]; | |||
a7=a[8]; | |||
b0=b[14]; | |||
b1=b[15]; | |||
b2=b[13]; | |||
b3=b[12]; | |||
b4=b[11]; | |||
b5=b[10]; | |||
b6=b[9]; | |||
b7=b[8]; | |||
temp = _mm256_mullo_epi16 (a[1], b1); | |||
temp = mul_add (a[2], b0, temp ); | |||
temp = mul_add (a[3], b2, temp ); | |||
temp = mul_add (a[4], b3, temp ); | |||
temp = mul_add (a[5], b4, temp ); | |||
temp = mul_add (a[6], b5, temp ); | |||
temp = mul_add (a[7], b6, temp ); | |||
temp = mul_add (a7, b7, temp ); | |||
temp = mul_add (a6, b[7], temp ); | |||
temp = mul_add (a5, b[6], temp ); | |||
temp = mul_add (a4, b[5], temp ); | |||
temp = mul_add (a3, b[4], temp ); | |||
temp = mul_add (a2, b[3], temp ); | |||
temp = mul_add (a0, b[2], temp ); | |||
c_avx[16] = mul_add (a1, b[1], temp ); | |||
temp = _mm256_mullo_epi16 (a[2], b1); | |||
temp = mul_add (a[3], b0, temp ); | |||
temp = mul_add (a[4], b2, temp ); | |||
temp = mul_add (a[5], b3, temp ); | |||
temp = mul_add (a[6], b4, temp ); | |||
temp = mul_add (a[7], b5, temp ); | |||
temp = mul_add (a7, b6, temp ); | |||
temp = mul_add (a6, b7, temp ); | |||
temp = mul_add (a5, b[7], temp ); | |||
temp = mul_add (a4, b[6], temp ); | |||
temp = mul_add (a3, b[5], temp ); | |||
temp = mul_add (a2, b[4], temp ); | |||
temp = mul_add (a0, b[3], temp ); | |||
c_avx[17] = mul_add (a1, b[2], temp ); | |||
temp = _mm256_mullo_epi16 (a[3], b1); | |||
temp = mul_add (a[4], b0, temp ); | |||
temp = mul_add (a[5], b2, temp ); | |||
temp = mul_add (a[6], b3, temp ); | |||
temp = mul_add (a[7], b4, temp ); | |||
temp = mul_add (a7, b5, temp ); | |||
temp = mul_add (a6, b6, temp ); | |||
temp = mul_add (a5, b7, temp ); | |||
temp = mul_add (a4, b[7], temp ); | |||
temp = mul_add (a3, b[6], temp ); | |||
temp = mul_add (a2, b[5], temp ); | |||
temp = mul_add (a0, b[4], temp ); | |||
c_avx[18] = mul_add (a1, b[3], temp ); | |||
temp = _mm256_mullo_epi16 (a[4], b1); | |||
temp = mul_add (a[5], b0, temp ); | |||
temp = mul_add (a[6], b2, temp ); | |||
temp = mul_add (a[7], b3, temp ); | |||
temp = mul_add (a7, b4, temp ); | |||
temp = mul_add (a6, b5, temp ); | |||
temp = mul_add (a5, b6, temp ); | |||
temp = mul_add (a4, b7, temp ); | |||
temp = mul_add (a3, b[7], temp ); | |||
temp = mul_add (a2, b[6], temp ); | |||
temp = mul_add (a0, b[5], temp ); | |||
c_avx[19] = mul_add (a1, b[4], temp ); | |||
temp = _mm256_mullo_epi16 (a[5], b1); | |||
temp = mul_add (a[6], b0, temp ); | |||
temp = mul_add (a[7], b2, temp ); | |||
temp = mul_add (a7, b3, temp ); | |||
temp = mul_add (a6, b4, temp ); | |||
temp = mul_add (a5, b5, temp ); | |||
temp = mul_add (a4, b6, temp ); | |||
temp = mul_add (a3, b7, temp ); | |||
temp = mul_add (a2, b[7], temp ); | |||
temp = mul_add (a0, b[6], temp ); | |||
c_avx[20] = mul_add (a1, b[5], temp ); | |||
temp = _mm256_mullo_epi16 (a[6], b1); | |||
temp = mul_add (a[7], b0, temp ); | |||
temp = mul_add (a7, b2, temp ); | |||
temp = mul_add (a6, b3, temp ); | |||
temp = mul_add (a5, b4, temp ); | |||
temp = mul_add (a4, b5, temp ); | |||
temp = mul_add (a3, b6, temp ); | |||
temp = mul_add (a2, b7, temp ); | |||
temp = mul_add (a0, b[7], temp ); | |||
c_avx[21] = mul_add (a1, b[6], temp ); | |||
temp = _mm256_mullo_epi16 (a[7], b1); | |||
temp = mul_add (a7, b0, temp ); | |||
temp = mul_add (a6, b2, temp ); | |||
temp = mul_add (a5, b3, temp ); | |||
temp = mul_add (a4, b4, temp ); | |||
temp = mul_add (a3, b5, temp ); | |||
temp = mul_add (a2, b6, temp ); | |||
temp = mul_add (a0, b7, temp ); | |||
c_avx[22] = mul_add (a1, b[7], temp ); | |||
temp = _mm256_mullo_epi16 (a7, b1); | |||
temp = mul_add (a6, b0, temp ); | |||
temp = mul_add (a5, b2, temp ); | |||
temp = mul_add (a4, b3, temp ); | |||
temp = mul_add (a3, b4, temp ); | |||
temp = mul_add (a2, b5, temp ); | |||
temp = mul_add (a0, b6, temp ); | |||
c_avx[23] = mul_add (a1, b7, temp ); | |||
temp = _mm256_mullo_epi16 (a6, b1); | |||
temp = mul_add (a5, b0, temp ); | |||
temp = mul_add (a4, b2, temp ); | |||
temp = mul_add (a3, b3, temp ); | |||
temp = mul_add (a2, b4, temp ); | |||
temp = mul_add (a0, b5, temp ); | |||
c_avx[24] = mul_add (a1, b6, temp ); | |||
temp = _mm256_mullo_epi16 (a5, b1); | |||
temp = mul_add (a4, b0, temp ); | |||
temp = mul_add (a3, b2, temp ); | |||
temp = mul_add (a2, b3, temp ); | |||
temp = mul_add (a0, b4, temp ); | |||
c_avx[25] = mul_add (a1, b5, temp ); | |||
temp = _mm256_mullo_epi16 (a4, b1); | |||
temp = mul_add (a3, b0, temp ); | |||
temp = mul_add (a2, b2, temp ); | |||
temp = mul_add (a0, b3, temp ); | |||
c_avx[26] = mul_add (a1, b4, temp ); | |||
temp = _mm256_mullo_epi16 (a3, b1); | |||
temp = mul_add (a2, b0, temp ); | |||
temp = mul_add (a0, b2, temp ); | |||
c_avx[27] = mul_add (a1, b3, temp ); | |||
temp = _mm256_mullo_epi16 (a2, b1); | |||
temp = mul_add (a0, b0, temp ); | |||
c_avx[28] = mul_add (a1, b2, temp ); | |||
temp = _mm256_mullo_epi16 (a0, b1); | |||
c_avx[29] = mul_add (a1, b0, temp); | |||
c_avx[30] = _mm256_mullo_epi16 (a1, b1); | |||
c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0); | |||
} |
@@ -11,81 +11,102 @@ | |||
#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1))) | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N] = {{0}}; | |||
uint8_t seed_A[SABER_SEEDBYTES]; | |||
uint8_t seed_s[SABER_NOISE_SEEDBYTES]; | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly s[SABER_L]; | |||
poly res[SABER_L]; | |||
uint8_t rand[SABER_NOISESEEDBYTES]; | |||
uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
randombytes(seed_A, SABER_SEEDBYTES); | |||
shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state | |||
randombytes(seed_s, SABER_NOISE_SEEDBYTES); | |||
PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_SABER_CLEAN_GenSecret(s, seed_s); | |||
PQCLEAN_SABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); | |||
randombytes(rand, SABER_NOISESEEDBYTES); | |||
PQCLEAN_SABER_CLEAN_GenSecret(s, rand); | |||
PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, s); | |||
PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A | |||
PQCLEAN_SABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // Matrix in transposed order | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_N; j++) { | |||
b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); | |||
memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); | |||
PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, res); // pack public key | |||
} | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
uint16_t A[SABER_L][SABER_L][SABER_N]; | |||
uint16_t sp[SABER_L][SABER_N]; | |||
uint16_t bp[SABER_L][SABER_N] = {{0}}; | |||
uint16_t vp[SABER_N] = {0}; | |||
uint16_t mp[SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { | |||
size_t i, j; | |||
poly A[SABER_L][SABER_L]; | |||
poly res[SABER_L]; | |||
poly s[SABER_L]; | |||
poly *temp = A[0]; // re-use stack space | |||
poly *vprime = &A[0][0]; | |||
poly *message = &A[0][1]; | |||
const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; | |||
uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
PQCLEAN_SABER_CLEAN_GenSecret(s, noiseseed); | |||
PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); | |||
PQCLEAN_SABER_CLEAN_GenSecret(sp, seed_sp); | |||
PQCLEAN_SABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); | |||
PQCLEAN_SABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed | |||
for (i = 0; i < SABER_L; i++) { | |||
// rounding | |||
for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits | |||
for (j = 0; j < SABER_N; j++) { | |||
bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); | |||
res[i].coeffs[j] += h1; | |||
res[i].coeffs[j] >>= SABER_EQ - SABER_EP; | |||
res[i].coeffs[j] &= SABER_Q - 1; | |||
} | |||
} | |||
PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, res); | |||
PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); | |||
PQCLEAN_SABER_CLEAN_BS2POLVECp(b, pk); | |||
PQCLEAN_SABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); | |||
PQCLEAN_SABER_CLEAN_BS2POLmsg(mp, m); | |||
// vector-vector scalar multiplication with mod p | |||
PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, pk); | |||
PQCLEAN_SABER_CLEAN_InnerProd(vprime, temp, s); | |||
PQCLEAN_SABER_CLEAN_BS2POLmsg(message, m); | |||
for (j = 0; j < SABER_N; j++) { | |||
vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); | |||
for (i = 0; i < SABER_N; i++) { | |||
vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); | |||
vprime->coeffs[i] &= SABER_P - 1; | |||
vprime->coeffs[i] >>= SABER_EP - SABER_ET; | |||
} | |||
PQCLEAN_SABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); | |||
PQCLEAN_SABER_CLEAN_POLT2BS(msk_c, vprime); | |||
} | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
uint16_t s[SABER_L][SABER_N]; | |||
uint16_t b[SABER_L][SABER_N]; | |||
uint16_t v[SABER_N] = {0}; | |||
uint16_t cm[SABER_N]; | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { | |||
size_t i; | |||
poly temp[SABER_L]; | |||
poly s[SABER_L]; | |||
const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; | |||
poly *v = &temp[0]; | |||
poly *cm = &temp[1]; | |||
PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk); | |||
PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext); | |||
PQCLEAN_SABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); | |||
PQCLEAN_SABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); | |||
PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, ciphertext); | |||
PQCLEAN_SABER_CLEAN_InnerProd(&temp[0], temp, s); | |||
PQCLEAN_SABER_CLEAN_BS2POLT(cm, packed_cm); | |||
for (i = 0; i < SABER_N; i++) { | |||
v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); | |||
v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); | |||
v->coeffs[i] &= SABER_P - 1; | |||
v->coeffs[i] >>= SABER_EP - 1; | |||
} | |||
PQCLEAN_SABER_CLEAN_POLmsg2BS(m, v); | |||
@@ -5,7 +5,7 @@ | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]); | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]); | |||
void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]); | |||
@@ -2,19 +2,21 @@ | |||
#define PARAMS_H | |||
/* Change this for different security strengths */ | |||
/* Don't change anything below this line */ | |||
#define SABER_L 3 | |||
#define SABER_MU 8 | |||
#define SABER_ET 4 | |||
#define SABER_EQ 13 | |||
#define SABER_EP 10 | |||
#define SABER_N 256 | |||
#define SABER_EP 10 | |||
#define SABER_P (1 << SABER_EP) | |||
#define SABER_EQ 13 | |||
#define SABER_Q (1 << SABER_EQ) | |||
#define SABER_SEEDBYTES 32 | |||
#define SABER_NOISE_SEEDBYTES 32 | |||
#define SABER_NOISESEEDBYTES 32 | |||
#define SABER_KEYBYTES 32 | |||
#define SABER_HASHBYTES 32 | |||
@@ -15,4 +15,4 @@ int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, cons | |||
int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk); | |||
#endif /* api_h */ | |||
#endif /* PQCLEAN_SABER_CLEAN_API_H */ |
@@ -1,132 +1,145 @@ | |||
#include "api.h" | |||
#include "SABER_params.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include <string.h> | |||
void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_byte = j; | |||
offset_data = 2 * j; | |||
bytes[offset_byte] = (data[offset_data] & 0x0f) | ((data[offset_data + 1] & 0x0f) << 4); | |||
out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); | |||
in += 2; | |||
out += 1; | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j, offset_byte, offset_data; | |||
void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 2; j++) { | |||
offset_byte = j; | |||
offset_data = 2 * j; | |||
data[offset_data] = bytes[offset_byte] & 0x0f; | |||
data[offset_data + 1] = (bytes[offset_byte] >> 4) & 0x0f; | |||
out[0] = in[0] & 0x0f; | |||
out[1] = (in[0] >> 4) & 0x0f; | |||
in += 1; | |||
out += 2; | |||
} | |||
} | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff); | |||
bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2); | |||
bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7); | |||
bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff); | |||
bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4); | |||
bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff); | |||
bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1); | |||
bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6); | |||
bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff); | |||
bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3); | |||
bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); | |||
out[2] = ((in[1] >> 3) & 0xff); | |||
out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); | |||
out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); | |||
out[5] = ((in[3] >> 1) & 0xff); | |||
out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); | |||
out[7] = ((in[4] >> 4) & 0xff); | |||
out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); | |||
out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); | |||
out[10] = ((in[6] >> 2) & 0xff); | |||
out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); | |||
out[12] = ((in[7] >> 5) & 0xff); | |||
in += 8; | |||
out += 13; | |||
} | |||
} | |||
static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 8; j++) { | |||
offset_byte = 13 * j; | |||
offset_data = 8 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8); | |||
data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11); | |||
data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6); | |||
data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9); | |||
data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12); | |||
data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7); | |||
data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10); | |||
data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); | |||
out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); | |||
out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); | |||
out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); | |||
out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); | |||
out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); | |||
out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); | |||
out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); | |||
in += 13; | |||
out += 8; | |||
} | |||
} | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { | |||
size_t j, offset_byte, offset_data; | |||
static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { | |||
size_t j; | |||
const uint16_t *in = data->coeffs; | |||
uint8_t *out = bytes; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); | |||
bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2); | |||
bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); | |||
bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6); | |||
bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff); | |||
out[0] = (in[0] & (0xff)); | |||
out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); | |||
out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); | |||
out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); | |||
out[4] = ((in[3] >> 2) & 0xff); | |||
in += 4; | |||
out += 5; | |||
} | |||
} | |||
static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j, offset_byte, offset_data; | |||
static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { | |||
size_t j; | |||
const uint8_t *in = bytes; | |||
uint16_t *out = data->coeffs; | |||
for (j = 0; j < SABER_N / 4; j++) { | |||
offset_byte = 5 * j; | |||
offset_data = 4 * j; | |||
data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8); | |||
data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6); | |||
data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4); | |||
data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2); | |||
out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); | |||
out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); | |||
out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); | |||
out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); | |||
in += 5; | |||
out += 4; | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); | |||
POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLq(data[i], bytes + i * SABER_POLYBYTES); | |||
BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { | |||
void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); | |||
POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { | |||
size_t i; | |||
for (i = 0; i < SABER_L; i++) { | |||
BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); | |||
BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { | |||
void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { | |||
size_t i, j; | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
data[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { | |||
void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { | |||
size_t i, j; | |||
memset(bytes, 0, SABER_KEYBYTES); | |||
for (j = 0; j < SABER_KEYBYTES; j++) { | |||
for (i = 0; i < 8; i++) { | |||
bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); | |||
bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); | |||
} | |||
} | |||
} |
@@ -1,27 +1,28 @@ | |||
#ifndef PACK_UNPACK_H | |||
#define PACK_UNPACK_H | |||
#include "SABER_params.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <stdio.h> | |||
void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); | |||
void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data); | |||
void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]); | |||
void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); | |||
void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); | |||
void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]); | |||
void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]); | |||
void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data); | |||
#endif |
@@ -3,32 +3,40 @@ | |||
#include "fips202.h" | |||
#include "pack_unpack.h" | |||
#include "poly.h" | |||
#include "poly_mul.h" | |||
#include <stddef.h> | |||
void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { | |||
void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { | |||
size_t i, j; | |||
for (i = 0; i < SABER_L; i++) { | |||
for (j = 0; j < SABER_L; j++) { | |||
if (transpose == 1) { | |||
PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]); | |||
} else { | |||
PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]); | |||
if (transpose) { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); | |||
} | |||
} | |||
} else { | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); | |||
for (j = 1; j < SABER_L; j++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); | |||
} | |||
} | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { | |||
size_t j; | |||
for (j = 0; j < SABER_L; j++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul_acc(res, b[j], s[j]); | |||
void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { | |||
size_t i; | |||
PQCLEAN_SABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); | |||
for (i = 1; i < SABER_L; i++) { | |||
PQCLEAN_SABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYVECBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); | |||
@@ -37,13 +45,13 @@ void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const | |||
} | |||
} | |||
void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { | |||
size_t i; | |||
uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; | |||
shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES); | |||
shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); | |||
for (i = 0; i < SABER_L; i++) { | |||
PQCLEAN_SABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); | |||
PQCLEAN_SABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); | |||
} | |||
} |
@@ -3,13 +3,21 @@ | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); | |||
typedef union { | |||
uint16_t coeffs[SABER_N]; | |||
} poly; | |||
void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); | |||
void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose); | |||
void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]); | |||
void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]); | |||
void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate); | |||
#endif |
@@ -1,4 +1,4 @@ | |||
#include "poly_mul.h" | |||
#include "poly.h" | |||
#include <stdint.h> | |||
#include <string.h> | |||
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t | |||
} | |||
/* res += a*b */ | |||
void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) { | |||
uint16_t c[2 * SABER_N] = {0}; | |||
void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { | |||
uint16_t C[2 * SABER_N] = {0}; | |||
size_t i; | |||
toom_cook_4way(c, a, b); | |||
toom_cook_4way(C, a->coeffs, b->coeffs); | |||
/* reduction */ | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
res[i - SABER_N] += (c[i - SABER_N] - c[i]); | |||
if (accumulate == 0) { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); | |||
} | |||
} else { | |||
for (i = SABER_N; i < 2 * SABER_N; i++) { | |||
c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); | |||
} | |||
} | |||
} |
@@ -1,9 +1,3 @@ | |||
#ifndef POLY_MUL_H | |||
#define POLY_MUL_H | |||
#include "SABER_params.h" | |||
#include <stdint.h> | |||
void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]); | |||
#endif |
@@ -3,5 +3,14 @@ consistency_checks: | |||
scheme: firesaber | |||
implementation: clean | |||
files: | |||
- api.h | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c |
@@ -3,5 +3,14 @@ consistency_checks: | |||
scheme: firesaber | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c |
@@ -3,13 +3,27 @@ consistency_checks: | |||
scheme: lightsaber | |||
implementation: clean | |||
files: | |||
- api.h | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c | |||
- source: | |||
scheme: saber | |||
implementation: clean | |||
files: | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c | |||
- source: | |||
scheme: saber | |||
@@ -22,13 +36,20 @@ consistency_checks: | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- pack_unpack.c | |||
- poly.c | |||
- poly_mul.c | |||
- SABER_indcpa.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
implementation: clean | |||
files: | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
@@ -41,5 +62,7 @@ consistency_checks: | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- pack_unpack.c | |||
- poly.c | |||
- poly_mul.c | |||
- SABER_indcpa.c | |||
- verify.c |
@@ -3,7 +3,16 @@ consistency_checks: | |||
scheme: lightsaber | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c | |||
- source: | |||
scheme: saber | |||
@@ -24,7 +33,12 @@ consistency_checks: | |||
scheme: saber | |||
implementation: avx2 | |||
files: | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
@@ -45,5 +59,10 @@ consistency_checks: | |||
scheme: firesaber | |||
implementation: avx2 | |||
files: | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c |
@@ -3,13 +3,27 @@ consistency_checks: | |||
scheme: saber | |||
implementation: clean | |||
files: | |||
- api.h | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
implementation: clean | |||
files: | |||
- cbd.h | |||
- pack_unpack.h | |||
- kem.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
@@ -22,5 +36,7 @@ consistency_checks: | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- pack_unpack.c | |||
- poly.c | |||
- poly_mul.c | |||
- SABER_indcpa.c | |||
- verify.c |
@@ -3,7 +3,16 @@ consistency_checks: | |||
scheme: saber | |||
implementation: avx2 | |||
files: | |||
- api.h | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- SABER_params.h | |||
- verify.h | |||
- cbd.c | |||
- kem.c | |||
- pack_unpack.c | |||
- verify.c | |||
- source: | |||
scheme: firesaber | |||
@@ -24,5 +33,10 @@ consistency_checks: | |||
scheme: firesaber | |||
implementation: avx2 | |||
files: | |||
- cbd.h | |||
- poly_mul.h | |||
- pack_unpack.h | |||
- SABER_indcpa.h | |||
- verify.h | |||
- kem.c | |||
- verify.c |