From 0a3146831f0112d01a620a4e5d3f50b87bc50fe1 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Thu, 15 Oct 2020 20:00:04 -0400
Subject: [PATCH 01/10] Update Saber and add AVX2 implementation

---
 crypto_kem/firesaber/META.yml                 |   11 +-
 crypto_kem/firesaber/avx2/LICENSE             |    1 +
 crypto_kem/firesaber/avx2/Makefile            |   22 +
 crypto_kem/firesaber/avx2/SABER_indcpa.c      |  416 +++++++
 crypto_kem/firesaber/avx2/SABER_indcpa.h      |   13 +
 crypto_kem/firesaber/avx2/SABER_params.h      |   45 +
 crypto_kem/firesaber/avx2/api.h               |   18 +
 crypto_kem/firesaber/avx2/cbd.c               |   52 +
 crypto_kem/firesaber/avx2/cbd.h               |   16 +
 crypto_kem/firesaber/avx2/kem.c               |   79 ++
 crypto_kem/firesaber/avx2/kem.h               |   35 +
 crypto_kem/firesaber/avx2/pack_unpack.c       |  502 ++++++++
 crypto_kem/firesaber/avx2/pack_unpack.h       |   56 +
 crypto_kem/firesaber/avx2/poly.h              |   27 +
 crypto_kem/firesaber/avx2/polymul/consts.h    |   20 +
 crypto_kem/firesaber/avx2/polymul/matrix.c    |  303 +++++
 crypto_kem/firesaber/avx2/polymul/scm_avx.c   |  753 ++++++++++++
 .../firesaber/avx2/polymul/toom-cook_4way.c   | 1010 +++++++++++++++++
 crypto_kem/firesaber/avx2/verify.c            |   35 +
 crypto_kem/firesaber/avx2/verify.h            |   22 +
 crypto_kem/firesaber/clean/LICENSE            |    9 +-
 crypto_kem/firesaber/clean/Makefile           |    4 +-
 crypto_kem/firesaber/clean/SABER_indcpa.c     |  316 +-----
 crypto_kem/firesaber/clean/SABER_indcpa.h     |   12 +-
 crypto_kem/firesaber/clean/SABER_params.h     |   42 +-
 crypto_kem/firesaber/clean/api.h              |   12 +-
 crypto_kem/firesaber/clean/cbd.c              |   34 +-
 crypto_kem/firesaber/clean/cbd.h              |    7 +-
 crypto_kem/firesaber/clean/kem.c              |   78 +-
 crypto_kem/firesaber/clean/pack_unpack.c      |  336 ++----
 crypto_kem/firesaber/clean/pack_unpack.h      |   29 +-
 crypto_kem/firesaber/clean/poly.c             |   58 +-
 crypto_kem/firesaber/clean/poly.h             |   23 +-
 crypto_kem/firesaber/clean/poly_mul.c         |   18 +-
 crypto_kem/firesaber/clean/poly_mul.h         |    8 +-
 crypto_kem/firesaber/clean/verify.c           |   13 +-
 crypto_kem/firesaber/clean/verify.h           |    7 +-
 crypto_kem/lightsaber/META.yml                |   11 +-
 crypto_kem/lightsaber/avx2/LICENSE            |    1 +
 crypto_kem/lightsaber/avx2/Makefile           |   22 +
 crypto_kem/lightsaber/avx2/SABER_indcpa.c     |  416 +++++++
 crypto_kem/lightsaber/avx2/SABER_indcpa.h     |   13 +
 crypto_kem/lightsaber/avx2/SABER_params.h     |   46 +
 crypto_kem/lightsaber/avx2/api.h              |   18 +
 crypto_kem/lightsaber/avx2/cbd.c              |   51 +
 crypto_kem/lightsaber/avx2/cbd.h              |   16 +
 crypto_kem/lightsaber/avx2/kem.c              |   79 ++
 crypto_kem/lightsaber/avx2/kem.h              |   35 +
 crypto_kem/lightsaber/avx2/pack_unpack.c      |  502 ++++++++
 crypto_kem/lightsaber/avx2/pack_unpack.h      |   56 +
 crypto_kem/lightsaber/avx2/poly.h             |   27 +
 crypto_kem/lightsaber/avx2/polymul/consts.h   |   20 +
 crypto_kem/lightsaber/avx2/polymul/matrix.c   |  303 +++++
 crypto_kem/lightsaber/avx2/polymul/scm_avx.c  |  753 ++++++++++++
 .../lightsaber/avx2/polymul/toom-cook_4way.c  | 1010 +++++++++++++++++
 crypto_kem/lightsaber/avx2/verify.c           |   35 +
 crypto_kem/lightsaber/avx2/verify.h           |   22 +
 crypto_kem/lightsaber/clean/LICENSE           |    9 +-
 crypto_kem/lightsaber/clean/Makefile          |    4 +-
 crypto_kem/lightsaber/clean/SABER_indcpa.c    |  316 +-----
 crypto_kem/lightsaber/clean/SABER_indcpa.h    |   12 +-
 crypto_kem/lightsaber/clean/SABER_params.h    |   43 +-
 crypto_kem/lightsaber/clean/api.h             |   12 +-
 crypto_kem/lightsaber/clean/cbd.c             |   27 +-
 crypto_kem/lightsaber/clean/cbd.h             |    7 +-
 crypto_kem/lightsaber/clean/kem.c             |   78 +-
 crypto_kem/lightsaber/clean/pack_unpack.c     |  338 ++----
 crypto_kem/lightsaber/clean/pack_unpack.h     |   29 +-
 crypto_kem/lightsaber/clean/poly.c            |   58 +-
 crypto_kem/lightsaber/clean/poly.h            |   23 +-
 crypto_kem/lightsaber/clean/poly_mul.c        |   18 +-
 crypto_kem/lightsaber/clean/poly_mul.h        |    8 +-
 crypto_kem/lightsaber/clean/verify.c          |   13 +-
 crypto_kem/lightsaber/clean/verify.h          |    7 +-
 crypto_kem/saber/META.yml                     |   11 +-
 crypto_kem/saber/avx2/LICENSE                 |    1 +
 crypto_kem/saber/avx2/Makefile                |   22 +
 crypto_kem/saber/avx2/SABER_indcpa.c          |  416 +++++++
 crypto_kem/saber/avx2/SABER_indcpa.h          |   13 +
 crypto_kem/saber/avx2/SABER_params.h          |   46 +
 crypto_kem/saber/avx2/api.h                   |   18 +
 crypto_kem/saber/avx2/cbd.c                   |   51 +
 crypto_kem/saber/avx2/cbd.h                   |   16 +
 crypto_kem/saber/avx2/kem.c                   |   79 ++
 crypto_kem/saber/avx2/kem.h                   |   35 +
 crypto_kem/saber/avx2/pack_unpack.c           |  502 ++++++++
 crypto_kem/saber/avx2/pack_unpack.h           |   56 +
 crypto_kem/saber/avx2/poly.h                  |   27 +
 crypto_kem/saber/avx2/polymul/consts.h        |   20 +
 crypto_kem/saber/avx2/polymul/matrix.c        |  303 +++++
 crypto_kem/saber/avx2/polymul/scm_avx.c       |  753 ++++++++++++
 .../saber/avx2/polymul/toom-cook_4way.c       | 1010 +++++++++++++++++
 crypto_kem/saber/avx2/verify.c                |   35 +
 crypto_kem/saber/avx2/verify.h                |   22 +
 crypto_kem/saber/clean/LICENSE                |    9 +-
 crypto_kem/saber/clean/Makefile               |    4 +-
 crypto_kem/saber/clean/SABER_indcpa.c         |  316 +-----
 crypto_kem/saber/clean/SABER_indcpa.h         |   12 +-
 crypto_kem/saber/clean/SABER_params.h         |   43 +-
 crypto_kem/saber/clean/api.h                  |   12 +-
 crypto_kem/saber/clean/cbd.c                  |   31 +-
 crypto_kem/saber/clean/cbd.h                  |    7 +-
 crypto_kem/saber/clean/kem.c                  |   78 +-
 crypto_kem/saber/clean/pack_unpack.c          |  340 ++----
 crypto_kem/saber/clean/pack_unpack.h          |   29 +-
 crypto_kem/saber/clean/poly.c                 |   58 +-
 crypto_kem/saber/clean/poly.h                 |   23 +-
 crypto_kem/saber/clean/poly_mul.c             |   18 +-
 crypto_kem/saber/clean/poly_mul.h             |    8 +-
 crypto_kem/saber/clean/verify.c               |   13 +-
 crypto_kem/saber/clean/verify.h               |    7 +-
 test/duplicate_consistency/firesaber_avx2.yml |    7 +
 .../duplicate_consistency/firesaber_clean.yml |   36 +-
 .../duplicate_consistency/lightsaber_avx2.yml |   45 +
 .../lightsaber_clean.yml                      |   78 +-
 test/duplicate_consistency/saber_avx2.yml     |   26 +
 test/duplicate_consistency/saber_clean.yml    |   57 +-
 117 files changed, 11459 insertions(+), 2114 deletions(-)
 create mode 100644 crypto_kem/firesaber/avx2/LICENSE
 create mode 100644 crypto_kem/firesaber/avx2/Makefile
 create mode 100644 crypto_kem/firesaber/avx2/SABER_indcpa.c
 create mode 100644 crypto_kem/firesaber/avx2/SABER_indcpa.h
 create mode 100644 crypto_kem/firesaber/avx2/SABER_params.h
 create mode 100644 crypto_kem/firesaber/avx2/api.h
 create mode 100644 crypto_kem/firesaber/avx2/cbd.c
 create mode 100644 crypto_kem/firesaber/avx2/cbd.h
 create mode 100644 crypto_kem/firesaber/avx2/kem.c
 create mode 100644 crypto_kem/firesaber/avx2/kem.h
 create mode 100644 crypto_kem/firesaber/avx2/pack_unpack.c
 create mode 100644 crypto_kem/firesaber/avx2/pack_unpack.h
 create mode 100644 crypto_kem/firesaber/avx2/poly.h
 create mode 100644 crypto_kem/firesaber/avx2/polymul/consts.h
 create mode 100644 crypto_kem/firesaber/avx2/polymul/matrix.c
 create mode 100644 crypto_kem/firesaber/avx2/polymul/scm_avx.c
 create mode 100644 crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
 create mode 100644 crypto_kem/firesaber/avx2/verify.c
 create mode 100644 crypto_kem/firesaber/avx2/verify.h
 create mode 100644 crypto_kem/lightsaber/avx2/LICENSE
 create mode 100644 crypto_kem/lightsaber/avx2/Makefile
 create mode 100644 crypto_kem/lightsaber/avx2/SABER_indcpa.c
 create mode 100644 crypto_kem/lightsaber/avx2/SABER_indcpa.h
 create mode 100644 crypto_kem/lightsaber/avx2/SABER_params.h
 create mode 100644 crypto_kem/lightsaber/avx2/api.h
 create mode 100644 crypto_kem/lightsaber/avx2/cbd.c
 create mode 100644 crypto_kem/lightsaber/avx2/cbd.h
 create mode 100644 crypto_kem/lightsaber/avx2/kem.c
 create mode 100644 crypto_kem/lightsaber/avx2/kem.h
 create mode 100644 crypto_kem/lightsaber/avx2/pack_unpack.c
 create mode 100644 crypto_kem/lightsaber/avx2/pack_unpack.h
 create mode 100644 crypto_kem/lightsaber/avx2/poly.h
 create mode 100644 crypto_kem/lightsaber/avx2/polymul/consts.h
 create mode 100644 crypto_kem/lightsaber/avx2/polymul/matrix.c
 create mode 100644 crypto_kem/lightsaber/avx2/polymul/scm_avx.c
 create mode 100644 crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
 create mode 100644 crypto_kem/lightsaber/avx2/verify.c
 create mode 100644 crypto_kem/lightsaber/avx2/verify.h
 create mode 100644 crypto_kem/saber/avx2/LICENSE
 create mode 100644 crypto_kem/saber/avx2/Makefile
 create mode 100644 crypto_kem/saber/avx2/SABER_indcpa.c
 create mode 100644 crypto_kem/saber/avx2/SABER_indcpa.h
 create mode 100644 crypto_kem/saber/avx2/SABER_params.h
 create mode 100644 crypto_kem/saber/avx2/api.h
 create mode 100644 crypto_kem/saber/avx2/cbd.c
 create mode 100644 crypto_kem/saber/avx2/cbd.h
 create mode 100644 crypto_kem/saber/avx2/kem.c
 create mode 100644 crypto_kem/saber/avx2/kem.h
 create mode 100644 crypto_kem/saber/avx2/pack_unpack.c
 create mode 100644 crypto_kem/saber/avx2/pack_unpack.h
 create mode 100644 crypto_kem/saber/avx2/poly.h
 create mode 100644 crypto_kem/saber/avx2/polymul/consts.h
 create mode 100644 crypto_kem/saber/avx2/polymul/matrix.c
 create mode 100644 crypto_kem/saber/avx2/polymul/scm_avx.c
 create mode 100644 crypto_kem/saber/avx2/polymul/toom-cook_4way.c
 create mode 100644 crypto_kem/saber/avx2/verify.c
 create mode 100644 crypto_kem/saber/avx2/verify.h
 create mode 100644 test/duplicate_consistency/firesaber_avx2.yml
 create mode 100644 test/duplicate_consistency/lightsaber_avx2.yml
 create mode 100644 test/duplicate_consistency/saber_avx2.yml

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index e58c7a7c..def16e46 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,4 +14,13 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+    - name: avx2
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      supported_platforms:
+          - architecture: x86_64
+            operating_systems:
+                - Linux
+                - Darwin
+            required_flags:
+                - avx2
diff --git a/crypto_kem/firesaber/avx2/LICENSE b/crypto_kem/firesaber/avx2/LICENSE
new file mode 100644
index 00000000..d5d21fff
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/LICENSE
@@ -0,0 +1 @@
+Public Domain
diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile
new file mode 100644
index 00000000..a44bbdb4
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/Makefile
@@ -0,0 +1,22 @@
+# This Makefile can be used with GNU Make or BSD Make
+
+LIB=libfiresaber_avx2.a
+HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
+OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+
+CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
+
+all: $(LIB)
+
+%.o: %.s $(HEADERS)
+	$(AS) -o $@ $<
+
+%.o: %.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB): $(OBJECTS)
+	$(AR) -r $@ $(OBJECTS)
+
+clean:
+	$(RM) $(OBJECTS)
+	$(RM) $(LIB)
diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c
new file mode 100644
index 00000000..ab017224
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c
@@ -0,0 +1,416 @@
+#include "./polymul/toom-cook_4way.c"
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "randombytes.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+//#include "randombytes.h"
+//#include "./polymul/toom_cook_4/toom-cook_4way.c"
+
+#define h1 (1 << (SABER_EQ - SABER_EP - 1)) // rounding constant 2^(EQ-EP-1); evaluates to 4 for EQ=13, EP=10
+// h2 = 2^(EP-2) - 2^(EP-ET-1) + 2^(EQ-EP-1); added in indcpa_kem_dec before the final shift
+#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+
+
+static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
+    // Packs SABER_KEYBYTES * 8 coefficients into bytes: bit b of byte n comes from coefficient 8 * n + b.
+    for (int32_t byte_idx = 0; byte_idx < SABER_KEYBYTES; byte_idx++) {
+        uint8_t packed = 0;
+        for (int32_t bit = 0; bit < 8; bit++) {
+            packed = packed | (message_dec_unpacked[byte_idx * 8 + bit] << bit);
+        }
+        message_dec[byte_idx] = packed;
+    }
+}
+
+/*-----------------------------------------------------------------------------------
+    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
+-------------------------------------------------------------------------------------*/
+
+static void GenMatrix(polyvec *a, const uint8_t *seed) {
+    // Expands seed with SHAKE-128 into SABER_K * SABER_K packed polynomials of SABER_POLYBYTES
+    // (== 13 * SABER_N / 8) bytes each and unpacks them in row-major order into a[row].vec[col].
+    uint8_t stream[SABER_K * SABER_K * SABER_POLYBYTES];
+    uint16_t unpacked[SABER_N];
+    const uint16_t q_mask = SABER_Q - 1;
+    int row, col, idx;
+
+    shake128(stream, sizeof(stream), seed, SABER_SEEDBYTES);
+    for (row = 0; row < SABER_K; row++) {
+        for (col = 0; col < SABER_K; col++) {
+            uint8_t *chunk = stream + (row * SABER_K + col) * SABER_POLYBYTES;
+            PQCLEAN_FIRESABER_AVX2_BS2POLq(unpacked, chunk);
+            for (idx = 0; idx < SABER_N; idx++) {
+                a[row].vec[col].coeffs[idx] = unpacked[idx] & q_mask;
+            }
+        }
+    }
+}
+
+static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
+    // SHAKE-128 expands the noise seed into SABER_MU * SABER_N / 8 bytes per polynomial;
+    // PQCLEAN_FIRESABER_AVX2_cbd converts each slice into one polynomial of r.
+    enum { BYTES_PER_POLY = SABER_MU * SABER_N / 8 };
+    uint8_t noise[SABER_K * BYTES_PER_POLY];
+    uint32_t poly = 0;
+    shake128(noise, sizeof(noise), seed, SABER_NOISESEEDBYTES);
+    while (poly < SABER_K) {
+        PQCLEAN_FIRESABER_AVX2_cbd(r[poly], noise + poly * BYTES_PER_POLY);
+        poly++;
+    }
+}
+
+//********************************matrix-vector mul routines*****************************************************
+static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+    // For every row, passes NUM_POLY (matrix entry, b_bucket[col]) pairs to toom_cook_4way_avx_n1
+    // with one shared c_bucket, then calls TC_interpol once to produce res_avx[row].
+    // NOTE(review): the last argument (col) presumably selects initialise-vs-accumulate; confirm in toom-cook_4way.c.
+    __m256i c_bucket[2 * SCM_SIZE * 4]; // Holds results for 9 Karatsuba at a time
+    int64_t row, col;
+
+    for (row = 0; row < NUM_POLY; row++) {
+        for (col = 0; col < NUM_POLY; col++) {
+            // isTranspose != 0 swaps the two matrix indices; the vector operand is unaffected
+            __m256i *entry = (isTranspose == 0) ? a1_avx_combined[row][col] : a1_avx_combined[col][row];
+
+            toom_cook_4way_avx_n1(entry, b_bucket[col], c_bucket, col);
+        }
+
+        // one interpolation per output polynomial
+        TC_interpol(c_bucket, res_avx[row]);
+    }
+}
+
+static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+    // Runs toom_cook_4way_avx_n1 on each (a_avx[k], b_bucket[k]) pair with one shared
+    // c_bucket, then lets TC_interpol write the combined result into res_avx.
+    __m256i c_bucket[2 * SCM_SIZE * 4]; // Holds results for 9 Karatsuba at a time
+    int64_t k = 0;
+    while (k < NUM_POLY) {
+        toom_cook_4way_avx_n1(a_avx[k], b_bucket[k], c_bucket, k);
+        k++;
+    }
+    TC_interpol(c_bucket, res_avx);
+}
+
+//********************************matrix-vector mul routines*****************************************************
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // IND-CPA key generation: sk <-- packed secret vector, pk <-- packed rounded product followed by seed.
+    polyvec a[SABER_K];
+
+    uint16_t skpv1[SABER_K][SABER_N];
+
+    // seed is expanded by GenMatrix and copied into pk; noiseseed is expanded by GenSecret.
+
+    uint8_t seed[SABER_SEEDBYTES];
+    uint8_t noiseseed[SABER_COINBYTES];
+    int32_t i, j, k;
+
+
+//--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+    // b_bucket (declared below) receives the TC_eval output of each secret polynomial.
+
+    mod = _mm256_set1_epi16(SABER_Q - 1);
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+
+//--------------AVX declaration ends------------------
+
+    randombytes(seed, SABER_SEEDBYTES);
+
+    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(noiseseed, SABER_COINBYTES);
+
+
+    GenMatrix(a, seed); // sample matrix A from the hashed seed
+
+    GenSecret(skpv1, noiseseed);
+
+
+// Load sk into avx vectors (16 coefficients of 16 bits per __m256i)
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+
+    }
+
+    // Load a into avx vectors
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+
+
+
+    //------------------------do the matrix vector multiplication and rounding------------
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+
+    // Now truncation: add h1, shift right by EQ-EP bits, then mask with SABER_Q - 1.
+    // NOTE(review): the result is packed with SABER_P below; presumably POLVEC2BS keeps only EP bits - confirm.
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+        }
+    }
+
+    //------------------Pack sk into byte string-------
+
+    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
+
+    //------------------Pack pk into byte string-------
+
+    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
+        }
+    }
+    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
+
+
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
+        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    }
+
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    // IND-CPA encryption of m under pk; the secret vector is derived from noiseseed by GenSecret.
+    // ciphertext <-- packed rounded matrix-vector product, then v' packed by SABER_pack_6bit.
+    uint32_t i, j, k;
+    polyvec a[SABER_K];     // matrix expanded from the seed stored in pk
+    uint8_t seed[SABER_SEEDBYTES];
+    uint16_t pkcl[SABER_K][SABER_N];    // public-key vector unpacked from pk
+
+
+    uint16_t skpv1[SABER_K][SABER_N];
+    uint16_t temp[SABER_K][SABER_N];
+    uint16_t message[SABER_KEYBYTES * 8];
+
+    uint8_t msk_c[SABER_SCALEBYTES_KEM];
+
+    //--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod, mod_p;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i vprime_avx[SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+
+
+    __m256i pkcl_avx[SABER_K][SABER_N / 16];
+
+    __m256i message_avx[SABER_N / 16];
+
+    mod = _mm256_set1_epi16(SABER_Q - 1);
+    mod_p = _mm256_set1_epi16(SABER_P - 1);
+
+
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+
+    //--------------AVX declaration ends------------------
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
+        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    }
+
+    GenMatrix(a, seed);
+    GenSecret(skpv1, noiseseed);
+
+    // ----------- Load skpv1 into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+    }
+
+    // ----------- Load the matrix a into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+    //-----------------matrix-vector multiplication and rounding
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+
+    // Now truncation: add h1, shift right by EQ-EP bits, then mask with SABER_Q - 1
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+
+        }
+    }
+
+
+    //-----this result is b_prime, the first part of the ciphertext
+    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
+        }
+    }
+
+    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
+
+//**************client matrix-vector multiplication ends******************//
+
+    //------now calculate the v'
+
+    //-------unpack the public_key
+    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
+        }
+    }
+
+    // InnerProduct
+    // vprime_avx is not cleared beforehand: vector_vector_mul passes it to TC_interpol as the
+    // output buffer. b_bucket still holds the TC_eval output of the secret vector computed
+    // above, so it is reused here instead of being recomputed.
+
+    // vector-vector scalar multiplication with mod p
+
+    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+
+    // Computation of v'+h1
+    for (i = 0; i < SABER_N / 16; i++) { //adding h1
+        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
+    }
+
+    // unpack m: one message bit per 16-bit element, least significant bit of each byte first
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            message[8 * j + i] = ((m[j] >> i) & 0x01);
+        }
+    }
+    // message encoding: move each bit to position EP-1
+    for (i = 0; i < SABER_N / 16; i++) {
+        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
+        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
+    }
+
+    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
+    for (k = 0; k < SABER_N / 16; k++) {
+        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
+        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
+        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
+    }
+
+    // Unpack avx into temp[0] (temp was already packed into ciphertext above)
+    for (j = 0; j < SABER_N / 16; j++) {
+        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
+    }
+
+    PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(msk_c, temp[0]);
+
+
+    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
+        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    }
+
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    // IND-CPA decryption: recovers the SABER_KEYBYTES-byte message m from ciphertext using sk.
+    uint32_t i, j;
+    uint16_t sksv[SABER_K][SABER_N]; // unpacked secret key
+    uint16_t pksv[SABER_K][SABER_N]; // unpacked first part of the ciphertext
+    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // one element contains one decrypted bit
+    uint8_t scale_ar[SABER_SCALEBYTES_KEM]; // second part of the ciphertext, still packed
+    uint16_t op[SABER_N]; // second part of the ciphertext, unpacked
+
+    //--------------AVX declaration------------------
+
+
+
+
+    __m256i v_avx[SABER_N / 16];
+
+
+
+    __m256i sksv_avx[SABER_K][SABER_N / 16];
+    __m256i pksv_avx[SABER_K][SABER_N / 16];
+
+
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+    //--------------AVX declaration ends------------------
+
+    //-------unpack the secret key and the ciphertext
+
+    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
+    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
+            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
+        }
+    }
+    // Clear v_avx without reading its indeterminate contents (the old x ^ x idiom read them).
+    for (i = 0; i < SABER_N / 16; i++) {
+        v_avx[i] = _mm256_setzero_si256();
+    }
+
+
+    // InnerProduct(b', s, mod p)
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sksv_avx[j], b_bucket[j]);
+    }
+
+    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+
+    for (i = 0; i < SABER_N / 16; i++) {
+        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
+    }
+
+
+    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
+        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
+    }
+
+    PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(op, scale_ar);
+
+
+    // add h2, subtract the rescaled second ciphertext part, reduce mod p and keep the top bit
+    for (i = 0; i < SABER_N; i++) {
+        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+    }
+
+
+    POL2MSG(m, message_dec_unpacked);
+}
diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.h b/crypto_kem/firesaber/avx2/SABER_indcpa.h
new file mode 100644
index 00000000..1b6c8311
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/SABER_indcpa.h
@@ -0,0 +1,13 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
+
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/SABER_params.h b/crypto_kem/firesaber/avx2/SABER_params.h
new file mode 100644
index 00000000..e1476b6a
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/SABER_params.h
@@ -0,0 +1,45 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+#include "api.h"
+
+
+
+
+#define SABER_K 4
+#define SABER_MU 6
+#define SABER_ET 6
+
+#define SABER_EQ 13
+#define SABER_EP 10
+
+#define SABER_N 256
+#define SABER_Q 8192 //2^13 == 2^SABER_EQ
+#define SABER_P 1024 //2^10 == 2^SABER_EP
+
+#define SABER_SEEDBYTES       32
+#define SABER_NOISESEEDBYTES  32
+#define SABER_COINBYTES       32
+#define SABER_KEYBYTES        32
+
+#define SABER_HASHBYTES       32
+
+#define SABER_POLYBYTES              416 //13*256/8 == SABER_EQ*SABER_N/8
+
+#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
+
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //320 == 10*256/8 == SABER_EP*SABER_N/8; NOTE: values up to here were changed for parameter adaptation
+
+#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
+
+#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+
+#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
+#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
+
+#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
+
+#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is the packed v' (SABER_ET bits per coefficient) written by indcpa_kem_enc */
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/api.h b/crypto_kem/firesaber/avx2/api.h
new file mode 100644
index 00000000..cb5240dd
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/api.h
@@ -0,0 +1,18 @@
+#ifndef PQCLEAN_FIRESABER_AVX2_API_H
+#define PQCLEAN_FIRESABER_AVX2_API_H
+
+
+#define PQCLEAN_FIRESABER_AVX2_CRYPTO_ALGNAME "FireSaber"
+#define PQCLEAN_FIRESABER_AVX2_CRYPTO_BYTES 32
+#define PQCLEAN_FIRESABER_AVX2_CRYPTO_CIPHERTEXTBYTES 1472
+#define PQCLEAN_FIRESABER_AVX2_CRYPTO_PUBLICKEYBYTES 1312
+#define PQCLEAN_FIRESABER_AVX2_CRYPTO_SECRETKEYBYTES 3040
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
+
+#endif /* PQCLEAN_FIRESABER_AVX2_API_H */
diff --git a/crypto_kem/firesaber/avx2/cbd.c b/crypto_kem/firesaber/avx2/cbd.c
new file mode 100644
index 00000000..37970a81
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/cbd.c
@@ -0,0 +1,52 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
+----------------------------------------------------------------------*/
+
+
+static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+    // Little-endian load: x[0] is the least significant byte and is always read.
+    uint64_t acc = x[0];
+    for (int pos = 1; pos < bytes; pos++) {
+        acc |= ((uint64_t) x[pos]) << (pos * 8);
+    }
+    return acc;
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
+    // Centered binomial sampler: every 3 input bytes (24 bits) produce 4 coefficients of r.
+    // Reads 3 * SABER_N / 4 bytes from buf and writes SABER_N coefficients.
+    const uint16_t q_mask = SABER_Q - 1;
+    int group, pass, c;
+
+    for (group = 0; group < SABER_N / 4; group++) {
+        // 24 fresh input bits for this group of four coefficients
+        const uint32_t bits = (uint32_t) load_littleendian(buf + 3 * group, 3);
+        uint32_t sums = 0;
+
+        // 0x249249 keeps every third bit, so after three passes each 3-bit field of
+        // sums holds the number of set bits among the matching 3 input bits (0..3).
+        for (pass = 0; pass < 3; pass++) {
+            sums += (bits >> pass) & 0x249249;
+        }
+
+        // Coefficient c uses field 2c as the positive count and field 2c+1 as the negative
+        // count; sums < 2^24, so masking the topmost field does not change its value.
+        for (c = 0; c < 4; c++) {
+            const uint32_t pos = (sums >> (6 * c)) & 0x7;
+            const uint32_t neg = (sums >> (6 * c + 3)) & 0x7;
+
+            // the difference wraps in 16 bits and is then reduced with SABER_Q - 1
+            r[4 * group + c] = (uint16_t) (pos - neg) & q_mask;
+        }
+
+    }
+}
diff --git a/crypto_kem/firesaber/avx2/cbd.h b/crypto_kem/firesaber/avx2/cbd.h
new file mode 100644
index 00000000..210bcc50
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/cbd.h
@@ -0,0 +1,16 @@
+#ifndef CBD_H
+#define CBD_H
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(available at, Public Domain https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
+----------------------------------------------------------------------*/
+#include "poly.h"
+#include <stdint.h>
+
+void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/kem.c b/crypto_kem/firesaber/avx2/kem.c
new file mode 100644
index 00000000..2e72e6aa
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/kem.c
@@ -0,0 +1,79 @@
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "fips202.h"
+#include "randombytes.h"
+#include "verify.h"
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+// Generates the CCA key pair: sk gets the IND-CPA secret key, then pk, SHA3-256(pk) and a random value. Returns 0.
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // sk[0 : SABER_INDCPA_SECRETKEYBYTES-1] <-- IND-CPA secret key
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(pk, sk);
+
+    // sk[SABER_INDCPA_SECRETKEYBYTES : SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
+    memcpy(sk + SABER_INDCPA_SECRETKEYBYTES, pk, SABER_INDCPA_PUBLICKEYBYTES);
+
+    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
+
+    // The last SABER_KEYBYTES bytes hold a random value; it replaces the pre-key when the check in PQCLEAN_FIRESABER_AVX2_crypto_kem_dec() fails.
+    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES);
+    return (0);
+}
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
+    // Encapsulation: writes the ciphertext to c and the derived shared key to k. Always returns 0.
+    uint8_t kr[64]; // Will contain key, coins
+    uint8_t buf[64];
+
+    randombytes(buf, 32);
+
+    sha3_256(buf, buf, 32); // buf[0:31] <-- random message (will be used as the key for client). Note: hashing ensures raw system RNG output is not released
+
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // buf[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
+
+    sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
+    // K^ <-- kr[0:31]
+    // noiseseed (r) <-- kr[32:63];
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
+    // The coins are no longer needed: overwrite kr[32:63] with h(c).
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
+    int i;
+    uint8_t fail;
+    uint8_t cmp[SABER_BYTES_CCA_DEC];
+    uint8_t buf[64];
+    uint8_t kr[64]; // Will contain key, coins
+    const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+    // Decapsulation: decrypt c, re-encrypt the result and compare with c before deriving k. Always returns 0.
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message
+
+    // Multitarget countermeasure for coins + contributory KEM
+    for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk
+        buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
+    }
+
+    sha3_512(kr, buf, 64);
+    // Re-encrypt the decrypted message with the coins kr[32:63] derived from it.
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk);
+    // NOTE(review): fail is presumably non-zero when c and cmp differ — confirm in verify.c.
+    fail = PQCLEAN_FIRESABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC);
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
+    // On failure the pre-key kr[0:SABER_KEYBYTES-1] is presumably replaced by the random value stored at the end of sk.
+    PQCLEAN_FIRESABER_AVX2_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
+
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h
new file mode 100644
index 00000000..a55514d9
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/kem.h
@@ -0,0 +1,35 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+// NOTE(review): the guard macro is INDCPA_H although this header is kem.h — confirm it cannot clash with the IND-CPA header's guard.
+#include <stdint.h>
+// NOTE(review): kem.c does not use the next three functions; whether they are defined anywhere is not visible here — confirm.
+void PQCLEAN_FIRESABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
+
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+
+// NOTE(review): kem.c calls indcpa_kem_enc(c, buf, kr + 32, pk) and indcpa_kem_dec(buf, sk, c) with const pk/sk/c; the parameter lists below differ — confirm.
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk);
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk,  uint8_t *ciphertext);
+
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]);
+
+// CCA-secure KEM entry points (defined in kem.c, where each one returns 0).
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);
+
+
+
+//uint64_t clock1,clock2;
+
+//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex;
+
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c
new file mode 100644
index 00000000..33c481b3
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/pack_unpack.c
@@ -0,0 +1,502 @@
+#include "pack_unpack.h"
+
+
+// Packs SABER_N coefficients of 3 bits each (the low 3 bits of every data entry) into 3 * SABER_N / 8 bytes.
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
+    uint32_t grp, k, word;
+    for (grp = 0; grp < SABER_N / 8; grp++) {
+        word = 0; // eight 3-bit fields, data[8 * grp] in the least significant bits
+        for (k = 0; k < 8; k++) {
+            word |= (uint32_t)(data[8 * grp + k] & 0x7) << (3 * k);
+        }
+        bytes[3 * grp + 0] = (uint8_t)(word & 0xff);
+        bytes[3 * grp + 1] = (uint8_t)((word >> 8) & 0xff);
+        bytes[3 * grp + 2] = (uint8_t)((word >> 16) & 0xff);
+    }
+}
+
+// Inverse of PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit: expands 3 * SABER_N / 8 bytes
+// into SABER_N coefficients, each holding one 3-bit value.
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
+
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 8; grp++) {
+        // Gather three bytes into one 24-bit word, first byte in the low bits ...
+        word = (uint32_t)bytes[3 * grp + 0];
+        word |= (uint32_t)bytes[3 * grp + 1] << 8;
+        word |= (uint32_t)bytes[3 * grp + 2] << 16;
+
+        // ... then peel off eight consecutive 3-bit fields.
+        for (k = 0; k < 8; k++) {
+            data[8 * grp + k] = (uint16_t)((word >> (3 * k)) & 0x07);
+        }
+    }
+
+}
+
+// Packs SABER_N 4-bit values into SABER_N / 2 bytes; even-indexed entries take the low nibble.
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
+    uint32_t k;
+
+    for (k = 0; k < SABER_N / 2; k++) {
+        const uint16_t lo = data[2 * k] & 0x0f;
+        const uint16_t hi = data[2 * k + 1] & 0x0f;
+        bytes[k] = (uint8_t)(lo | (hi << 4));
+    }
+}
+
+// Splits each of the SABER_N / 2 input bytes into two 4-bit values: low nibble first, high nibble second.
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
+    uint32_t k;
+    uint16_t *pair;
+
+    for (k = 0; k < SABER_N / 2; k++) {
+        pair = data + 2 * k;
+        pair[0] = bytes[k] & 0x0f;
+        pair[1] = (bytes[k] >> 4) & 0x0f;
+    }
+}
+
+// Packs SABER_N coefficients of 6 bits each (the low 6 bits of every data entry) into 3 * SABER_N / 4 bytes.
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
+    uint32_t grp, k, word;
+    for (grp = 0; grp < SABER_N / 4; grp++) {
+        word = 0; // four 6-bit fields, data[4 * grp] in the least significant bits
+        for (k = 0; k < 4; k++) {
+            word |= (uint32_t)(data[4 * grp + k] & 0x3f) << (6 * k);
+        }
+        bytes[3 * grp + 0] = (uint8_t)(word & 0xff);
+        bytes[3 * grp + 1] = (uint8_t)((word >> 8) & 0xff);
+        bytes[3 * grp + 2] = (uint8_t)((word >> 16) & 0xff);
+    }
+}
+
+
+// Inverse of PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit: expands 3 * SABER_N / 4 bytes
+// into SABER_N coefficients, each holding one 6-bit value.
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 4; grp++) {
+        word = (uint32_t)bytes[3 * grp + 0];
+        word |= (uint32_t)bytes[3 * grp + 1] << 8;
+        word |= (uint32_t)bytes[3 * grp + 2] << 16;
+        for (k = 0; k < 4; k++) {
+            data[4 * grp + k] = (uint16_t)((word >> (6 * k)) & 0x3f);
+        }
+    }
+
+}
+
+/*
+ * Serializes a vector of SABER_K polynomials using the low 10 bits of every
+ * coefficient.
+ *
+ * bytes: output, SABER_K * SABER_N * 10 / 8 bytes are written.
+ * data:  input coefficients; bits above the low 10 are ignored.
+ *
+ * Polynomial i starts at byte offset i * SABER_N * 10 / 8. Within a polynomial,
+ * four coefficients c0..c3 fill five consecutive bytes, low bits first:
+ *   byte 0 = c0[7:0]
+ *   byte 1 = c1[5:0] << 2 | c0[9:8]
+ *   byte 2 = c2[3:0] << 4 | c1[9:6]
+ *   byte 3 = c3[1:0] << 6 | c2[9:4]
+ *   byte 4 = c3[9:2]
+ *
+ * PQCLEAN_FIRESABER_AVX2_POLVECp2BS implements exactly this encoding, so the
+ * call is forwarded to it instead of keeping a second copy of the loop.
+ */
+void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data);
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Packs SABER_K polynomials at 10 bits per coefficient: every 4 coefficients become 5 bytes, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 10) / 8;
+        for (j = 0; j < SABER_N / 4; j++) {
+            offset_byte = offset_byte1 + 5 * j;
+            offset_data = 4 * j;
+            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+
+            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
+
+            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
+
+            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
+
+            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
+        }
+    }
+}
+
+void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Packs SABER_K polynomials at 13 bits per coefficient: every 8 coefficients become 13 bytes, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 13) / 8;
+        for (j = 0; j < SABER_N / 8; j++) {
+            offset_byte = offset_byte1 + 13 * j;
+            offset_data = 8 * j;
+            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+
+            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
+
+            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
+
+            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
+
+            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
+
+            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
+
+            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
+
+            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
+
+            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
+
+            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
+
+            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
+
+            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
+
+            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
+
+        }
+    }
+
+
+}
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
+    // Unpacks one polynomial: every 13 input bytes become eight 13-bit coefficients, low-order bits first.
+    uint32_t j;
+    uint32_t offset_data = 0, offset_byte = 0;
+
+    for (j = 0; j < SABER_N / 8; j++) {
+        offset_byte = 13 * j;
+        offset_data = 8 * j;
+        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
+        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
+        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
+        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
+        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
+        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
+        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
+        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+    }
+}
+
+
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    // Unpacks SABER_K polynomials: every 5 input bytes become four 10-bit coefficients, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 10) / 8;
+        for (j = 0; j < SABER_N / 4; j++) {
+            offset_byte = offset_byte1 + 5 * j;
+            offset_data = 4 * j;
+            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
+            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
+            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
+            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
+
+        }
+    }
+}
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    // Unpacks SABER_K polynomials: every 13 input bytes become eight 13-bit coefficients, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 13) / 8;
+        for (j = 0; j < SABER_N / 8; j++) {
+            offset_byte = offset_byte1 + 13 * j;
+            offset_data = 8 * j;
+            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
+            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
+            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
+            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
+            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
+            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
+            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
+            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        }
+    }
+
+
+}
+
+
+
+/*
+ * Deserializes a vector of SABER_K polynomials with 10-bit coefficients.
+ *
+ * data:  output, every coefficient receives a value below 2^10.
+ * bytes: input, SABER_K * SABER_N * 10 / 8 bytes are read.
+ *
+ * Polynomial i is read from byte offset i * SABER_N * 10 / 8. Within a polynomial,
+ * five consecutive bytes b0..b4 yield four coefficients, low bits first:
+ *   c0 = (b1[1:0] << 8) | b0
+ *   c1 = (b2[3:0] << 6) | b1[7:2]
+ *   c2 = (b3[5:0] << 4) | b2[7:4]
+ *   c3 = (b4      << 2) | b3[7:6]
+ *
+ * PQCLEAN_FIRESABER_AVX2_BS2POLVECp implements exactly this decoding, so the
+ * call is forwarded to it instead of keeping a second copy of the loop.
+ */
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes);
+}
+
+
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Packs SABER_K polynomials at 13 bits per coefficient. NOTE(review): this body is identical to PQCLEAN_FIRESABER_AVX2_POLVECq2BS; consider forwarding to it.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 13) / 8;
+        for (j = 0; j < SABER_N / 8; j++) {
+            offset_byte = offset_byte1 + 13 * j;
+            offset_data = 8 * j;
+            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+
+            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
+
+            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
+
+            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
+
+            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
+
+            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
+
+            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
+
+            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
+
+            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
+
+            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
+
+            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
+
+            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
+
+            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
+
+        }
+    }
+
+
+}
+
+/*
+ * Deserializes a vector of SABER_K polynomials with 13-bit coefficients.
+ *
+ * data:  output, every coefficient receives a value below 2^13.
+ * bytes: input, SABER_K * SABER_N * 13 / 8 bytes are read
+ *        (polynomial i starts at byte offset i * SABER_N * 13 / 8).
+ *
+ * Thirteen consecutive bytes b0..b12 yield eight coefficients, low bits first:
+ *   c0 = (b1[4:0]  << 8)  | b0
+ *   c1 = (b3[1:0]  << 11) | (b2  << 3) | b1[7:5]
+ *   c2 = (b4[6:0]  << 6)  | b3[7:2]
+ *   c3 = (b6[3:0]  << 9)  | (b5  << 1) | b4[7]
+ *   c4 = (b8[0]    << 12) | (b7  << 4) | b6[7:4]
+ *   c5 = (b9[5:0]  << 7)  | b8[7:1]
+ *   c6 = (b11[2:0] << 10) | (b10 << 2) | b9[7:6]
+ *   c7 = (b12      << 5)  | b11[7:3]
+ *
+ * PQCLEAN_FIRESABER_AVX2_BS2POLVECq implements exactly this decoding, so the
+ * call is forwarded to it instead of keeping a second copy of the loop.
+ */
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes);
+}
+
+/*
+ * Deserializes a single polynomial with 13-bit coefficients.
+ *
+ * data:  output, SABER_N coefficients are written, each below 2^13.
+ * bytes: input, SABER_N * 13 / 8 bytes are read.
+ *
+ * Thirteen consecutive bytes b0..b12 yield eight coefficients, low bits first:
+ *   c0 = (b1[4:0]  << 8)  | b0
+ *   c1 = (b3[1:0]  << 11) | (b2  << 3) | b1[7:5]
+ *   c2 = (b4[6:0]  << 6)  | b3[7:2]
+ *   c3 = (b6[3:0]  << 9)  | (b5  << 1) | b4[7]
+ *   c4 = (b8[0]    << 12) | (b7  << 4) | b6[7:4]
+ *   c5 = (b9[5:0]  << 7)  | b8[7:1]
+ *   c6 = (b11[2:0] << 10) | (b10 << 2) | b9[7:6]
+ *   c7 = (b12      << 5)  | b11[7:3]
+ *
+ * PQCLEAN_FIRESABER_AVX2_BS2POLq implements exactly this decoding, so the call
+ * is forwarded to it instead of keeping a second copy of the loop (the former
+ * copy also carried commented-out remnants of the vector variant).
+ */
+void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
+    PQCLEAN_FIRESABER_AVX2_BS2POLq(data, bytes);
+}
+
+
+
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Packs SABER_K polynomials at 11 bits per coefficient: every 8 coefficients
+       become 11 bytes, low-order bits first. */
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 11) / 8;
+        for (j = 0; j < SABER_N / 8; j++) {
+            offset_byte = offset_byte1 + 11 * j;
+            offset_data = 8 * j;
+            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+
+            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3);
+
+            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6);
+
+            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff );
+
+            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1);
+
+            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4);
+
+            bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7);
+
+            bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff );
+
+            bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2);
+
+            bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5);
+
+            bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff );
+
+        }
+    }
+
+}
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    // Unpacks SABER_K polynomials: every 11 input bytes become eight 11-bit coefficients, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 11) / 8;
+        for (j = 0; j < SABER_N / 8; j++) {
+            offset_byte = offset_byte1 + 11 * j;
+            offset_data = 8 * j;
+
+            data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 );
+
+            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 );
+
+            data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 );
+
+            data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 );
+
+            data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 );
+
+            data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 );
+
+            data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 );
+
+            data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 );
+        }
+    }
+
+
+}
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Packs SABER_K polynomials at 14 bits per coefficient: every 4 coefficients become 7 bytes, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 14) / 8;
+        for (j = 0; j < SABER_N / 4; j++) {
+            offset_byte = offset_byte1 + 7 * j;
+            offset_data = 4 * j;
+            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+
+            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6);
+
+            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff );
+
+            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4);
+
+            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff );
+
+            bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2);
+
+            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff );
+        }
+    }
+
+
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    // Unpacks SABER_K polynomials: every 7 input bytes become four 14-bit coefficients, low-order bits first.
+    uint32_t i, j;
+    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+
+    for (i = 0; i < SABER_K; i++) {
+        offset_byte1 = i * (SABER_N * 14) / 8;
+        for (j = 0; j < SABER_N / 4; j++) {
+            offset_byte = offset_byte1 + 7 * j;
+            offset_data = 4 * j;
+            data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 );
+
+            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 );
+
+            data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 );
+
+            data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 );
+        }
+    }
+
+
+}
+
+// Selects the packer by modulus: 1024 -> 10-bit encoding, 8192 -> 13-bit encoding; any other value writes nothing.
+void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
+    switch (modulus) {
+    case 1024: PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data); break;
+    case 8192: PQCLEAN_FIRESABER_AVX2_POLVECq2BS(bytes, data); break;
+    default: break;
+    }
+}
+
+// Selects the unpacker by modulus: 1024 -> 10-bit decoding, 8192 -> 13-bit decoding.
+// For any other modulus data is left untouched.
+void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
+    switch (modulus) {
+    case 1024: PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes); break;
+    case 8192: PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes); break;
+    default: break;
+    }
+}
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.h b/crypto_kem/firesaber/avx2/pack_unpack.h
new file mode 100644
index 00000000..ba8a568f
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/pack_unpack.h
@@ -0,0 +1,56 @@
+#ifndef PACK_UNPACK_H
+#define PACK_UNPACK_H
+#include "SABER_params.h"
+#include <stdint.h>
+#include <stdio.h>
+/* Byte string -> polynomial / polynomial vector. BS2POLVEC dispatches on modulus (1024 or 8192). */
+void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+/* Polynomial vector -> byte string. POLVEC2BS dispatches on modulus (1024 or 8192). */
+void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+
+void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Fixed-width packers: 3/4/6-bit variants take one polynomial, 10/11/13/14-bit variants take a vector. */
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Single-polynomial 13-bit unpacker. */
+void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+
+/* Fixed-width unpackers, mirroring the packers above. */
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/poly.h b/crypto_kem/firesaber/avx2/poly.h
new file mode 100644
index 00000000..8443de34
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/poly.h
@@ -0,0 +1,27 @@
+#ifndef POLY_H
+#define POLY_H
+/*---------------------------------------------------------------------
+This file has been adapted from the public-domain implementation
+(available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+#include "SABER_params.h"
+#include <stdint.h>
+/* A polynomial: SABER_N 16-bit coefficients. */
+typedef struct {
+    uint16_t coeffs[SABER_N];
+} poly;
+/* A vector of SABER_K polynomials. */
+typedef struct {
+    poly vec[SABER_K];
+} polyvec;
+/* NOTE(review): presumably samples the noise polynomial r from seed and nonce; the definition is not visible here — confirm. */
+void PQCLEAN_FIRESABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+
+/* NOTE(review): takes three output pointers (r0..r2) but four nonces (nonce0..nonce3) — confirm this is intended. */
+void PQCLEAN_FIRESABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+
+
+#endif
diff --git a/crypto_kem/firesaber/avx2/polymul/consts.h b/crypto_kem/firesaber/avx2/polymul/consts.h
new file mode 100644
index 00000000..40826398
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/consts.h
@@ -0,0 +1,20 @@
+#ifndef POLYMUL_CONSTS_H
+#define POLYMUL_CONSTS_H
+/* Size constants for the AVX2 polynomial multiplication code. */
+#include "../SABER_params.h"
+
+#define AVX_N (SABER_N >> 4)
+#define small_len_avx (AVX_N >> 2)
+
+#define SCHB_N 16
+
+#define N_SB (SABER_N >> 2)
+#define N_SB_RES (2*N_SB-1)
+#define N_SB_16 (N_SB >> 2)
+#define N_SB_16_RES (2*N_SB_16-1)
+
+#define AVX_N1 16 /*N/16*/
+#define SCM_SIZE 16
+// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements
+#define NUM_POLY SABER_K
+#endif
diff --git a/crypto_kem/firesaber/avx2/polymul/matrix.c b/crypto_kem/firesaber/avx2/polymul/matrix.c
new file mode 100644
index 00000000..5fa35783
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/matrix.c
@@ -0,0 +1,303 @@
+#include <immintrin.h>
+
+// In-place transpose of a 16x16 matrix of 16-bit elements: on entry M[r] holds row r,
+// on exit M[c] holds what was column c. Built from 16/32/64-bit unpacks plus 128-bit lane permutes.
+static void transpose_n1(__m256i *M)
+{
+	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+	register __m256i temp, temp0, temp1, temp2;
+
+	// Low-half pass, stage 1: interleave the 16-bit elements 0-3 and 8-11 of each pair of rows.
+		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
+		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
+		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
+		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
+		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
+		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
+
+		// Stage 2: 32-bit interleave. unpacklo keeps columns 0,1 / 8,9; unpackhi keeps columns 2,3 / 10,11.
+		temp = _mm256_unpacklo_epi32(r0, r1); 
+		temp0 = _mm256_unpacklo_epi32(r2, r3); 
+		temp1 = _mm256_unpacklo_epi32(r4, r5); 
+		temp2 = _mm256_unpacklo_epi32(r6, r7); 
+
+		r8 = _mm256_unpackhi_epi32(r0, r1); 
+		r9 = _mm256_unpackhi_epi32(r2, r3); 
+		r10 = _mm256_unpackhi_epi32(r4, r5); 
+		r11 = _mm256_unpackhi_epi32(r6, r7);
+		// Stage 3: 64-bit interleave leaves one column per 128-bit half (r0, r2: rows 0-7; r1, r3: rows 8-15).
+		r0 = _mm256_unpacklo_epi64(temp, temp0); 
+		r2 = _mm256_unpackhi_epi64(temp, temp0); 
+
+		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
+		r3 = _mm256_unpackhi_epi64(temp1, temp2);
+		// Begin the high-half pass now: rows 0-9 are read here, before M[0], M[1], M[8], M[9] are overwritten below.
+		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
+		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
+		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
+		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
+		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
+		// Stage 4: lane permutes join rows 0-7 with rows 8-15; 0x20 selects both low halves, 0x31 both high halves.
+		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
+		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+
+		// Rows 10-15 are read here, before M[10] and M[11] are overwritten below.
+		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
+		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
+		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+
+		// Columns 2, 3, 10, 11 come from the unpackhi_epi32 results held in r8..r11.
+		r0 = _mm256_unpacklo_epi64(r8, r9); 
+		r1 = _mm256_unpacklo_epi64(r10, r11); 
+
+		r2 = _mm256_unpackhi_epi64(r8, r9); 
+		r3 = _mm256_unpackhi_epi64(r10, r11); 
+
+
+
+		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+
+	// High-half pass, stage 2: 32-bit interleave of the unpackhi_epi16 results;
+	// unpacklo_epi32 keeps columns 4,5 / 12,13.
+		r0 = _mm256_unpacklo_epi32(temp, temp0); 
+		r1 = _mm256_unpacklo_epi32(temp1, temp2);
+		r2 = _mm256_unpacklo_epi32(r4, r5); 
+		r3 = _mm256_unpacklo_epi32(r6, r7); 
+
+
+
+
+	// 64-bit interleave: r8/r9 carry columns 4 and 12, r10/r11 carry columns 5 and 13
+	// (first register of each pair built from rows 0-7, second from rows 8-15).
+		r8 = _mm256_unpacklo_epi64(r0, r1); 
+		r10 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r9 = _mm256_unpacklo_epi64(r2, r3); 
+		r11 = _mm256_unpackhi_epi64(r2, r3); 
+
+		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+		// unpackhi_epi32 on the same inputs keeps columns 6,7 / 14,15.
+		r0 = _mm256_unpackhi_epi32(temp, temp0); 
+		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
+		r2 = _mm256_unpackhi_epi32(r4, r5); 
+		r3 = _mm256_unpackhi_epi32(r6, r7); 
+
+
+	// 64-bit interleave for columns 6, 7, 14, 15.
+
+		r4 = _mm256_unpacklo_epi64(r0, r1); 
+		r6 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r5 = _mm256_unpacklo_epi64(r2, r3); 
+		r7 = _mm256_unpackhi_epi64(r2, r3); 
+
+
+
+	// Final lane permutes store columns 6, 14, 7 and 15.
+
+	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+/*
+void transpose_unrolled(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
+
+	//for(i=0; i<8; i=i+1)
+	//{
+		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
+		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
+
+		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
+		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
+
+		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
+		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
+
+		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
+		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
+
+		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
+		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
+
+		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
+		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
+
+		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
+		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
+
+		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
+		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+	//}
+
+	//-------------------------------------------------------
+	//for(i=0; i<4; i=i+1)
+	//{
+		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
+		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
+
+		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
+		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
+
+		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
+		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
+
+		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
+		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
+
+	//}
+
+	//for(i=0; i<2; i=i+1)
+	//{
+		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
+		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
+
+		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
+		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
+
+		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	//}
+	//for(i=0; i<2; i=i+1)
+	//{
+		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
+		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
+
+		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
+		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
+
+	//}
+
+	//-------------------------------------------------------
+
+	//-------------------------------------------------------
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	//-------------------------------------------------------
+
+
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+
+
+void transpose1(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	for(i=0; i<8; i=i+1)
+	{
+		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
+		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
+	}
+
+	for(i=0; i<4; i=i+1)
+	{
+		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
+		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
+	}
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
+		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
+		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+*/
diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
new file mode 100644
index 00000000..4e4f11f8
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
@@ -0,0 +1,753 @@
+//#define SCM_SIZE 16
+
+//#pragma STDC FP_CONTRACT ON
+
+#include <immintrin.h>
+
+/* Lane-wise 16-bit multiply-accumulate: a*b + c, wrapping mod 2^16 per lane. `static` matters: a plain C99 `inline` definition emits no external symbol, so a non-inlined call would fail to link. */
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
+}
+
+// Accumulating 16x16 schoolbook product: c_avx[k] += sum over i+j=k of a[i]*b[j], for k = 0..30.
+// Every operation is lane-wise on 16-bit lanes, so products and sums wrap mod 2^16 in each lane.
+// Reads a[0..15], b[0..15] and the previous c_avx[0..30]; c_avx[31] is overwritten with zero.
+static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx)
+{
+	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	register __m256i temp;
+	// Copy the first eight vectors of each operand into locals; a[8..15] and b[8..15] are read in place.
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// First triangle: c_avx[0..15]; the number of terms grows from 1 to 16.
+
+	// Each sum is added onto the value already stored in c_avx[k].
+	c_avx[0] = mul_add(a0, b0, c_avx[0]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	temp=mul_add(a1, b0, temp);
+	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+	temp = mul_add(a1, b1, temp);
+	temp=mul_add(a2, b0, temp);
+	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	temp=mul_add(a3, b0, temp);
+	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	temp=mul_add(a2, b2, temp);
+	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	temp=mul_add(a5, b0, temp);
+	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	temp=mul_add(a4, b2, temp);
+	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	temp=mul_add(a5, b2, temp);
+	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	
+		temp=mul_add(a6, b2, temp);
+		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+		temp=mul_add(a7, b2, temp);
+		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
+
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+		temp=mul_add(a[8], b2, temp);
+		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+		temp=mul_add(a[9], b2, temp);
+		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+		temp=mul_add(a[10], b2, temp);
+		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+		temp=mul_add(a[11], b2, temp);
+		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
+
+
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+		temp=mul_add(a[12], b2, temp);
+		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+		temp=mul_add(a[13], b2, temp);
+		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
+
+
+	// Second triangle: c_avx[16..30]. The eight locals are re-bound to the upper operand halves
+	// (a0=a[14], a1=a[15], a2=a[13], ..., a7=a[8]; likewise for b); a[1..7] and b[1..7] are read in place.
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+		temp=mul_add(a1, b[1], temp);
+		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
+
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+		temp=mul_add(a1, b[2], temp);
+		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
+
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+		temp=mul_add(a1, b[3], temp);
+		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
+
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+		temp=mul_add(a1, b[4], temp);
+		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
+
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+		temp=mul_add(a1, b[5], temp);
+		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
+
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+		temp=mul_add(a1, b[6], temp);
+		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
+
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+		temp=mul_add(a1, b[7], temp);
+		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
+
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+		temp=mul_add(a1, b7, temp);
+		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
+
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+		temp=mul_add(a1, b6, temp);
+		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
+
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+		temp=mul_add(a1, b5, temp);
+		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
+
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+		temp=mul_add(a1, b4, temp);
+		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
+
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+		temp=mul_add(a1, b3, temp);
+		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
+
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+		temp=mul_add(a1, b2, temp);
+		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+		temp=mul_add(a1, b0, temp);
+		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
+
+
+		c_avx[30] = mul_add(a1, b1, c_avx[30]);
+
+
+	// No pair of operand indices sums to 2*SCM_SIZE-1, so the last slot is cleared rather than accumulated.
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+
+}
+
+
+
+// Non-accumulating 16x16 schoolbook product: c_avx[k] = sum over i+j=k of a[i]*b[j], for k = 0..30.
+// Every operation is lane-wise on 16-bit lanes, so products and sums wrap mod 2^16 in each lane.
+// Reads a[0..15] and b[0..15]; the previous contents of c_avx are not read, and c_avx[31] is set to zero.
+static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx)
+{
+	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	__m256i temp;
+	// Copy the first eight vectors of each operand into locals; a[8..15] and b[8..15] are read in place.
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// First triangle: c_avx[0..15]; the number of terms grows from 1 to 16. Results are stored, not added.
+	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[1]=mul_add(a1, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+
+	temp = mul_add(a1, b1, temp);
+	c_avx[2]= mul_add(a2, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	c_avx[3]= mul_add(a3, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	c_avx[4]= mul_add(a2, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	c_avx[5] = mul_add(a5, b0, temp);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	c_avx[6] = mul_add(a4, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	c_avx[7] = mul_add (a5, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	c_avx[8] = mul_add (a6, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+	c_avx[9] = mul_add (a7, b2, temp);
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+	c_avx[10] = mul_add (a[8], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+	c_avx[11] = mul_add (a[9], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+	c_avx[12] = mul_add (a[10], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+	c_avx[13] = mul_add (a[11], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+	c_avx[14] = mul_add (a[12], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+	c_avx[15] = mul_add (a[13], b2, temp );
+
+	// Second triangle: c_avx[16..30]. The eight locals are re-bound to the upper operand halves
+	// (a0=a[14], a1=a[15], a2=a[13], ..., a7=a[8]; likewise for b); a[1..7] and b[1..7] are read in place.
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+	
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+	c_avx[16] = mul_add (a1, b[1], temp );
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+	c_avx[17] = mul_add (a1, b[2], temp );
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+	c_avx[18] = mul_add (a1, b[3], temp );
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+	c_avx[19] = mul_add (a1, b[4], temp );
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+	c_avx[20] = mul_add (a1, b[5], temp );
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+	c_avx[21] = mul_add (a1, b[6], temp );
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+	c_avx[22] = mul_add (a1, b[7], temp );
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+	c_avx[23] = mul_add (a1, b7, temp );
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+	c_avx[24] = mul_add (a1, b6, temp );
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+	c_avx[25] = mul_add (a1, b5, temp );
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+	c_avx[26] = mul_add (a1, b4, temp );
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+	c_avx[27] = mul_add (a1, b3, temp );
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+	c_avx[28] = mul_add (a1, b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[29] = mul_add (a1, b0, temp);
+
+	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
+
+	// No pair of operand indices sums to 2*SCM_SIZE-1, so the last slot is just cleared.
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+}
diff --git a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
new file mode 100644
index 00000000..78fb86c2
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
@@ -0,0 +1,1010 @@
+/*
+Cleaned version; for a step-by-step approach, look into the _debug file.
+*/
+//#include "timing.c"
+#include "consts.h"
+#include "matrix.c"
+#include "scm_avx.c"
+
+static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)// Karatsuba-evaluates the 7 operands in a and multiplies them by b_bucket; interpolation happens in KARA_interpol.
+{
+	__m256i a_bucket[SCM_SIZE*4]; // 7 operands x 9 Karatsuba evaluation vectors each (entries 0..62); original note: SCM_SIZE = 16
+	// a: seven operands; the four vectors at a[k*small_len_avx .. +3] are read for k = 0..6.
+	// b_bucket: second operand, handed to the schoolbook routines in 16-vector groups
+	//           (presumably already in the layout KARA_eval produces -- confirm at the call site).
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+	// c_bucket: receives the products, one 2*SCM_SIZE-vector group per schoolbook call.
+	// f: 0 selects schoolbook_avx_new2; any other value selects schoolbook_avx_new3_acc
+	//    (the "_acc" suffix suggests it accumulates into c_bucket -- defined in scm_avx.c, confirm there).
+		// Each group k below stores r0..r3, r0+r1, r2+r3, r0+r2, r1+r3 and (r0+r2)+(r1+r3) at a_bucket[9*k .. 9*k+8].
+		
+		//------------------AVX evaluation for 1st poly-----------------------
+
+                    r0_avx=a[0];
+                    r1_avx=a[1];
+                    r2_avx=a[2];
+                    r3_avx=a[3];
+		    a_bucket[0]=r0_avx;
+		    a_bucket[1]=r1_avx;
+		    a_bucket[2]=r2_avx;
+		    a_bucket[3]=r3_avx;
+		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
+
+
+		//------------------AVX evaluation for 1st poly ends------------------
+
+
+		//------------------AVX evaluation for 2nd poly-----------------------
+                    r0_avx=a[small_len_avx];
+                    r1_avx=a[small_len_avx+1];
+                    r2_avx=a[small_len_avx+2];
+                    r3_avx=a[small_len_avx+3];
+		    a_bucket[0+9]=r0_avx;
+		    a_bucket[1+9]=r1_avx;
+		    a_bucket[2+9]=r2_avx;
+		    a_bucket[3+9]=r3_avx;
+		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
+
+	
+		//------------------AVX evaluation for 2nd poly ends------------------
+
+
+		//------------------AVX evaluation for 3rd poly-----------------------
+                    r0_avx=a[2*small_len_avx];
+                    r1_avx=a[2*small_len_avx+1];
+                    r2_avx=a[2*small_len_avx+2];
+                    r3_avx=a[2*small_len_avx+3];
+		    a_bucket[0+18]=r0_avx;
+		    a_bucket[1+18]=r1_avx;
+		    a_bucket[2+18]=r2_avx;
+		    a_bucket[3+18]=r3_avx;
+		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
+		
+		//------------------AVX evaluation for 3rd poly ends------------------
+
+
+		//------------------AVX evaluation for 4th poly-----------------------
+
+                    r0_avx=a[3*small_len_avx];
+                    r1_avx=a[3*small_len_avx+1];
+                    r2_avx=a[3*small_len_avx+2];
+                    r3_avx=a[3*small_len_avx+3];
+		    a_bucket[0+27]=r0_avx;
+		    a_bucket[1+27]=r1_avx;
+		    a_bucket[2+27]=r2_avx;
+		    a_bucket[3+27]=r3_avx;
+		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
+		
+		//------------------AVX evaluation for 4th poly ends------------------
+
+		//------------------AVX evaluation for 5th poly-----------------------
+		
+                    r0_avx=a[4*small_len_avx+0];
+                    r1_avx=a[4*small_len_avx+1];
+                    r2_avx=a[4*small_len_avx+2];
+                    r3_avx=a[4*small_len_avx+3];
+		    a_bucket[0+36]=r0_avx;
+		    a_bucket[1+36]=r1_avx;
+		    a_bucket[2+36]=r2_avx;
+		    a_bucket[3+36]=r3_avx;
+		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
+		
+		//------------------AVX evaluation for 5th poly ends------------------
+
+
+		//------------------AVX evaluation for 6th poly-----------------------
+                    r0_avx=a[5*small_len_avx];
+                    r1_avx=a[5*small_len_avx+1];
+                    r2_avx=a[5*small_len_avx+2];
+                    r3_avx=a[5*small_len_avx+3];
+		    a_bucket[0+45]=r0_avx;
+		    a_bucket[1+45]=r1_avx;
+		    a_bucket[2+45]=r2_avx;
+		    a_bucket[3+45]=r3_avx;
+		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
+		
+		//------------------AVX evaluation for 6th poly ends------------------
+
+		//------------------AVX evaluation for 7th poly-----------------------
+
+                    r0_avx=a[6*small_len_avx];
+                    r1_avx=a[6*small_len_avx+1];
+                    r2_avx=a[6*small_len_avx+2];
+                    r3_avx=a[6*small_len_avx+3];
+		    a_bucket[0+54]=r0_avx;
+		    a_bucket[1+54]=r1_avx;
+		    a_bucket[2+54]=r2_avx;
+		    a_bucket[3+54]=r3_avx;
+		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
+
+		//------------------AVX evaluation for 7th poly ends------------------
+		
+	
+
+		// NOTE(review): only a_bucket[0..62] are written above, yet transpose_n1(a_bucket+48) below is
+		// handed the group starting at index 48; if it reads 16 vectors, a_bucket[63] is uninitialized.
+		// transpose_n1 comes from matrix.c -- confirm whether that slot is read and whether it matters.
+
+
+		//CLOCK1=cpucycles();
+		//-----------------Forward transposes--------------------------------------
+			transpose_n1(a_bucket);
+			transpose_n1(a_bucket+16);
+			transpose_n1(a_bucket+32);
+			transpose_n1(a_bucket+48);
+
+		//-----------------Forward transposes ends---------------------------------
+
+		//----------------------all multiplications---------------------------------
+		if(f==0){
+			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		else{
+			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		/*
+		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
+		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
+		*/
+
+
+		//----------------------all multiplications ends-----------------------------
+
+
+		//-----------------Reverse transposes--------------------------------------
+			// Left disabled here; TC_interpol runs transpose_n1 over c_bucket before it calls KARA_interpol.
+			/*
+			transpose(c_bucket);
+			transpose(c_bucket+16);
+
+			transpose(c_bucket+2*SCM_SIZE);
+			transpose(c_bucket+16+2*SCM_SIZE);
+
+			transpose(c_bucket+4*SCM_SIZE);
+			transpose(c_bucket+16+4*SCM_SIZE);
+
+			transpose(c_bucket+6*SCM_SIZE);
+			transpose(c_bucket+16+6*SCM_SIZE);
+			*/
+		//-----------------Reverse transposes ends---------------------------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
+
+		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
+		
+}
+
+static void KARA_eval(__m256i* b, __m256i *b_bucket){
+	// Karatsuba evaluation of the seven operands in b into b_bucket (63 vectors), followed by transpose_n1 on each 16-vector group.
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
+	// For k = 0..6, r0..r3 = b[k*small_len_avx .. +3] are stored at b_bucket[9*k .. 9*k+8] as
+	// r0, r1, r2, r3, r0+r1, r2+r3, r0+r2, r1+r3, (r0+r2)+(r1+r3); all additions are per 16-bit lane.
+		//-------1st poly----------------------------------------------------
+                    r0_avx=b[0];
+                    r1_avx=b[1];
+                    r2_avx=b[2];
+                    r3_avx=b[3];
+		    b_bucket[0]=r0_avx;
+		    b_bucket[1]=r1_avx;
+		    b_bucket[2]=r2_avx;
+		    b_bucket[3]=r3_avx;
+		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
+		//-------2nd poly----------------------------------------------------
+
+                    r0_avx=b[small_len_avx];
+                    r1_avx=b[small_len_avx+1];
+                    r2_avx=b[small_len_avx+2];
+                    r3_avx=b[small_len_avx+3];
+		    b_bucket[0+9]=r0_avx;
+		    b_bucket[1+9]=r1_avx;
+		    b_bucket[2+9]=r2_avx;
+		    b_bucket[3+9]=r3_avx;
+		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
+
+		//-------3rd poly----------------------------------------------------
+
+                    r0_avx=b[2*small_len_avx+0];
+                    r1_avx=b[2*small_len_avx+1];
+                    r2_avx=b[2*small_len_avx+2];
+                    r3_avx=b[2*small_len_avx+3];
+		    b_bucket[0+18]=r0_avx;
+		    b_bucket[1+18]=r1_avx;
+		    b_bucket[2+18]=r2_avx;
+		    b_bucket[3+18]=r3_avx;
+		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
+
+		//-------4th poly----------------------------------------------------
+                    r0_avx=b[3*small_len_avx];
+                    r1_avx=b[3*small_len_avx+1];
+                    r2_avx=b[3*small_len_avx+2];
+                    r3_avx=b[3*small_len_avx+3];
+		    b_bucket[0+27]=r0_avx;
+		    b_bucket[1+27]=r1_avx;
+		    b_bucket[2+27]=r2_avx;
+		    b_bucket[3+27]=r3_avx;
+		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
+
+		//-------5th poly----------------------------------------------------
+
+                    r0_avx=b[4*small_len_avx];
+                    r1_avx=b[4*small_len_avx+1];
+                    r2_avx=b[4*small_len_avx+2];
+                    r3_avx=b[4*small_len_avx+3];
+		    b_bucket[0+36]=r0_avx;
+		    b_bucket[1+36]=r1_avx;
+		    b_bucket[2+36]=r2_avx;
+		    b_bucket[3+36]=r3_avx;
+		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
+
+		//-------6th poly----------------------------------------------------
+
+                    r0_avx=b[5*small_len_avx];
+                    r1_avx=b[5*small_len_avx+1];
+                    r2_avx=b[5*small_len_avx+2];
+                    r3_avx=b[5*small_len_avx+3];
+		    b_bucket[0+45]=r0_avx;
+		    b_bucket[1+45]=r1_avx;
+		    b_bucket[2+45]=r2_avx;
+		    b_bucket[3+45]=r3_avx;
+		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
+
+		//-------7th poly----------------------------------------------------
+
+                    r0_avx=b[6*small_len_avx];
+                    r1_avx=b[6*small_len_avx+1];
+                    r2_avx=b[6*small_len_avx+2];
+                    r3_avx=b[6*small_len_avx+3];
+		    b_bucket[0+54]=r0_avx;
+		    b_bucket[1+54]=r1_avx;
+		    b_bucket[2+54]=r2_avx;
+		    b_bucket[3+54]=r3_avx;
+		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
+
+		//--------------Evaluating B poly ends-------------------------------
+		// NOTE(review): only b_bucket[0..62] are written above; confirm b_bucket[63] is initialized by the caller or never used.
+			transpose_n1(b_bucket);
+			transpose_n1(b_bucket+16);
+			transpose_n1(b_bucket+32);
+			transpose_n1(b_bucket+48);	
+}
+
+static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
+		// Recombines the Karatsuba sub-products in c_bucket into seven results, result_final0..6, eight vectors each.
+		// Sub-product p = 9*k + j (operand k = 0..6, evaluation j = 0..8) is read as two vectors:
+		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
+		// c_bucket[32*(p/16) + p%16] and the entry 16 places after it; hence the index jumps below (e.g. 15 -> 32).
+		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
+		// In every section res_avx0/2/4/6 start as the first vectors of j = 0..3 and res_avx1/3/5/7 as the second
+		// vectors; the j = 4..8 products are folded in by subtraction/addition before the eight stores.
+
+		   //------------------------AVX interpolation for 1st poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[0];
+				res_avx2 = c_bucket[1];
+				res_avx4 = c_bucket[2];
+				res_avx6 = c_bucket[3];
+
+				c6_avx=c_bucket[6];
+				c7_avx=c_bucket[7];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[16];
+				res_avx3 = c_bucket[17];
+				res_avx5 = c_bucket[18];
+				res_avx7 = c_bucket[19];
+
+				c22_avx=c_bucket[22];
+				c23_avx=c_bucket[23];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final0[0]=res_avx0;
+				result_final0[1]=res_avx1;
+
+				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final0[6]=res_avx6;
+				result_final0[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 1st poly ends--------------
+
+
+		   //------------------------AVX interpolation for 2nd poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[9]; //c_bucket0
+				res_avx2 = c_bucket[10]; //c_bucket1
+				res_avx4 = c_bucket[11]; //c_bucket2
+				res_avx6 = c_bucket[12]; //c_bucket3
+
+				c6_avx=c_bucket[15]; //c_bucket6
+				c7_avx=c_bucket[32]; //c_bucket7
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[25]; //c_bucket0
+				res_avx3 = c_bucket[26]; //c_bucket1
+				res_avx5 = c_bucket[27]; //c_bucket2
+				res_avx7 = c_bucket[28]; //c_bucket3
+
+				c22_avx=c_bucket[31];
+				c23_avx=c_bucket[48];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final1[0]=res_avx0;
+				result_final1[1]=res_avx1;
+
+				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final1[6]=res_avx6;
+				result_final1[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 2nd poly ends--------------
+
+		   //------------------------AVX interpolation for 3rd poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[34]; //c_bucket0
+				res_avx2 = c_bucket[35]; //c_bucket1
+				res_avx4 = c_bucket[36];
+				res_avx6 = c_bucket[37];
+
+				c6_avx=c_bucket[40];
+				c7_avx=c_bucket[41];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[50]; //c_bucket0
+				res_avx3 = c_bucket[51]; //c_bucket1
+				res_avx5 = c_bucket[52];
+				res_avx7 = c_bucket[53];
+
+				c22_avx=c_bucket[56];
+				c23_avx=c_bucket[57];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+			//loop4
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+			//loop5
+				result_final2[0]=res_avx0;
+				result_final2[1]=res_avx1;
+
+				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final2[6]=res_avx6;
+				result_final2[7]=res_avx7;
+
+		   //------------------------AVX interpolation for 3rd poly ends--------------
+		
+		   //------------------------AVX interpolation for 4th poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[43];
+				res_avx2 = c_bucket[44];
+				res_avx4 = c_bucket[45];
+				res_avx6 = c_bucket[46];
+
+				c6_avx=c_bucket[65];
+				c7_avx=c_bucket[66];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[59];
+				res_avx3 = c_bucket[60];
+				res_avx5 = c_bucket[61];
+				res_avx7 = c_bucket[62];
+
+				c22_avx=c_bucket[81];
+				c23_avx=c_bucket[82];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final3[0]=res_avx0;
+				result_final3[1]=res_avx1;
+
+				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final3[6]=res_avx6;
+				result_final3[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 4th poly ends--------------
+
+		   //------------------------AVX interpolation for 5th poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[68];
+				res_avx2 = c_bucket[69];
+				res_avx4 = c_bucket[70];
+				res_avx6 = c_bucket[71];
+
+				c6_avx=c_bucket[74];
+				c7_avx=c_bucket[75];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[84];
+				res_avx3 = c_bucket[85];
+				res_avx5 = c_bucket[86];
+				res_avx7 = c_bucket[87];
+
+				c22_avx=c_bucket[90];
+				c23_avx=c_bucket[91];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final4[0]=res_avx0;
+				result_final4[1]=res_avx1;
+
+				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final4[6]=res_avx6;
+				result_final4[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 5th poly ends--------------
+
+		   //------------------------AVX interpolation for 6th poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[77];
+				res_avx2 = c_bucket[78];
+				res_avx4 = c_bucket[79];
+				res_avx6 = c_bucket[96];
+
+				c6_avx=c_bucket[99];
+				c7_avx=c_bucket[100];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[93];
+				res_avx3 = c_bucket[94];
+				res_avx5 = c_bucket[95];
+				res_avx7 = c_bucket[112];
+
+				c22_avx=c_bucket[115];
+				c23_avx=c_bucket[116];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final5[0]=res_avx0;
+				result_final5[1]=res_avx1;
+
+				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final5[6]=res_avx6;
+				result_final5[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 6th poly ends--------------
+
+		   //------------------------AVX interpolation for 7th poly external-------------------		
+			
+			//loop1
+				res_avx0 = c_bucket[102];
+				res_avx2 = c_bucket[103];
+				res_avx4 = c_bucket[104];
+				res_avx6 = c_bucket[105];
+
+				c6_avx=c_bucket[108];
+				c7_avx=c_bucket[109];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[118];
+				res_avx3 = c_bucket[119];
+				res_avx5 = c_bucket[120];
+				res_avx7 = c_bucket[121];
+
+				c22_avx=c_bucket[124];
+				c23_avx=c_bucket[125];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final6[0]=res_avx0;
+				result_final6[1]=res_avx1;
+
+				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final6[6]=res_avx6;
+				result_final6[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 7th poly ends--------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
+		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
+
+
+
+}
+
+static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
+	// Evaluates a_avx at seven points into aw_avx, then calls batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f).
+	int i;
+	// With r0..r3 = a_avx[i + k*small_len_avx] (k = 0..3), A(x) = r0 + r1*x + r2*x^2 + r3*x^3 and aw[m] = aw_avx[m*small_len_avx + i],
+	// the loop stores (per 16-bit lane, wrapping): aw[0] = r3, aw[1] = A(2), aw[2] = A(1), aw[3] = A(-1),
+	// aw[4] = 8*A(1/2) = 8*r0 + 4*r1 + 2*r2 + r3, aw[5] = 8*A(-1/2) = 8*r0 - 4*r1 + 2*r2 - r3, aw[6] = r0.
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+	__m256i aw_avx[7*small_len_avx];
+
+
+
+
+// EVALUATION
+
+	//CLOCK1=cpucycles();
+
+	for (i=0; i<small_len_avx; i++){
+		r0_avx=a_avx[i];
+		r1_avx=a_avx[i + small_len_avx];
+		r2_avx=a_avx[i + 2*small_len_avx];
+		r3_avx=a_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
+		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1);
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
+		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
+		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
+		aw_avx[6*small_len_avx+i]= r0_avx; 
+		aw_avx[i]= r3_avx;
+	}
+
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
+
+	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);//New
+
+}
+
+static void TC_eval(__m256i* b_avx, __m256i* b_bucket){
+	// Evaluates b_avx at seven points into bw_avx, then hands bw_avx to KARA_eval, which fills b_bucket.
+	int i;
+	__m256i bw_avx[7*small_len_avx];
+	// With r0..r3 = b_avx[i + k*small_len_avx] (k = 0..3), B(x) = r0 + r1*x + r2*x^2 + r3*x^3 and bw[m] = bw_avx[m*small_len_avx + i]:
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+	// bw[0] = r3, bw[1] = B(2), bw[2] = B(1), bw[3] = B(-1), bw[4] = 8*B(1/2), bw[5] = 8*B(-1/2), bw[6] = r0 (16-bit lanes, wrapping).
+	for (i=0; i<small_len_avx; i++){
+		
+		r0_avx=b_avx[i];
+		r1_avx=b_avx[i + small_len_avx];
+		r2_avx=b_avx[i + 2*small_len_avx];
+		r3_avx=b_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
+		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1);
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
+		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
+		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
+		bw_avx[6*small_len_avx+i]= r0_avx;
+		bw_avx[i]= r3_avx;
+	}
+	// Karatsuba-evaluate the seven evaluated operands into b_bucket (KARA_eval also runs transpose_n1 on it).
+	KARA_eval(bw_avx, b_bucket);
+
+}
+
+
+static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){
+	// Passes c_bucket through transpose_n1 and KARA_interpol to obtain seven products w1..w7, interpolates them
+	int i;
+	// lane-wise into res_avx_output, and finally writes res_avx[i] = res_avx_output[i] - res_avx_output[i+16] for i = 0..15.
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+
+	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx];
+
+	__m256i res_avx_output[2*AVX_N1];
+
+	// Each of the four 2*SCM_SIZE-vector product groups in c_bucket is handed to transpose_n1 as two 16-vector halves.
+
+	
+	transpose_n1(c_bucket);
+	transpose_n1(c_bucket+16);
+
+	transpose_n1(c_bucket+2*SCM_SIZE);
+	transpose_n1(c_bucket+16+2*SCM_SIZE);
+
+	transpose_n1(c_bucket+4*SCM_SIZE);
+	transpose_n1(c_bucket+16+4*SCM_SIZE);
+
+	transpose_n1(c_bucket+6*SCM_SIZE);
+	transpose_n1(c_bucket+16+6*SCM_SIZE);
+	
+
+	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx);
+	// Interpolation per 16-bit lane: odd divisors (3, 9, 15) are applied as a multiply by their inverse mod 2^16, powers of two as logical right shifts.
+	for (i = 0; i < 2*small_len_avx; i++) {
+
+		r0_avx = w1_avx[i];
+		r1_avx = w2_avx[i];
+		r2_avx = w3_avx[i];
+		r3_avx = w4_avx[i];
+		r4_avx = w5_avx[i];
+		r5_avx = w6_avx[i];
+		r6_avx = w7_avx[i];
+		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
+		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
+		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
+		r3_avx = _mm256_srli_epi16(r3_avx, 1);
+		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
+		temp_avx = _mm256_slli_epi16(r6_avx, 6);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_slli_epi16(r4_avx, 1);
+		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
+		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 6);
+		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
+		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
+		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 3);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
+		r4_avx = _mm256_srli_epi16(r4_avx, 3);
+		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
+		temp_avx = _mm256_slli_epi16(r3_avx, 4);
+		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
+		r1_avx= _mm256_srli_epi16(r1_avx, 1); 	
+		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
+		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
+		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
+		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
+		r5_avx= _mm256_srli_epi16(temp_avx, 2);
+		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
+		// Pieces sit small_len_avx apart, so for i >= small_len_avx each piece lands on slots the next piece already set and is added in.
+		if(i<small_len_avx){
+			res_avx_output[0*small_len_avx+i]=r6_avx;
+			res_avx_output[1*small_len_avx+i]=r5_avx;
+			res_avx_output[2*small_len_avx+i]=r4_avx;
+			res_avx_output[3*small_len_avx+i]=r3_avx;
+			res_avx_output[4*small_len_avx+i]=r2_avx;
+			res_avx_output[5*small_len_avx+i]=r1_avx;
+			res_avx_output[6*small_len_avx+i]=r0_avx;
+		}
+		else{
+			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
+			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
+			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
+			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
+			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
+			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
+			res_avx_output[6*small_len_avx+i]=r0_avx;
+		}
+	}
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
+
+	// Reduction by X^256 + 1: upper half is subtracted from lower half (the literal 16 presumably equals AVX_N1 -- confirm)
+	for(i=0; i<16; i++)
+  {
+		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
+  }
+
+}
diff --git a/crypto_kem/firesaber/avx2/verify.c b/crypto_kem/firesaber/avx2/verify.c
new file mode 100644
index 00000000..d78e12e0
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/verify.c
@@ -0,0 +1,35 @@
+#include "verify.h"
+
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
+----------------------------------------------------*/
+
+
+/* returns 0 for equal strings, 1 for non-equal strings; always scans all len bytes of a and b (no early exit) */
+uint8_t PQCLEAN_FIRESABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
+    uint64_t r; // OR-accumulator of all byte differences
+    size_t i;
+    r = 0;
+
+    for (i = 0; i < len; i++) {
+        r |= a[i] ^ b[i]; // a differing byte pair leaves at least one bit set in r
+    }
+
+    r = (~r + 1); // Two's complement: 0 stays 0; any value in 1..255 wraps to one with bit 63 set
+    r >>= 63; // keep only bit 63, i.e. 0 (equal) or 1 (different)
+    return (uint8_t) r;
+}
+
+/* b = 1 means mov (copy len bytes of x into r), b = 0 means don't mov (r is left unchanged); any other b yields a partial mask */
+void PQCLEAN_FIRESABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
+    size_t i;
+
+    b = -b; // turn the flag into a byte mask: 0 -> 0x00, 1 -> 0xFF
+    for (i = 0; i < len; i++) {
+        r[i] ^= b & (x[i] ^ r[i]); // mask 0xFF: r[i] becomes x[i]; mask 0x00: r[i] keeps its value
+    }
+}
diff --git a/crypto_kem/firesaber/avx2/verify.h b/crypto_kem/firesaber/avx2/verify.h
new file mode 100644
index 00000000..2ec50370
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/verify.h
@@ -0,0 +1,22 @@
+#ifndef VERIFY_H
+#define VERIFY_H
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
+----------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* returns 0 for equal strings, 1 for non-equal strings */
+uint8_t PQCLEAN_FIRESABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
+
+/* b = 1 means mov, b = 0 means don't mov*/
+void PQCLEAN_FIRESABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
+
+#endif
diff --git a/crypto_kem/firesaber/clean/LICENSE b/crypto_kem/firesaber/clean/LICENSE
index 08c799e3..d5d21fff 100644
--- a/crypto_kem/firesaber/clean/LICENSE
+++ b/crypto_kem/firesaber/clean/LICENSE
@@ -1,8 +1 @@
-----------------------------------------------------------------------------------------
-SABER_v1.1
-
-Public domain
-
-Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy,
-Frederik Vercauteren
-----------------------------------------------------------------------------------------
+Public Domain
diff --git a/crypto_kem/firesaber/clean/Makefile b/crypto_kem/firesaber/clean/Makefile
index e00112e8..8f8dd8f7 100644
--- a/crypto_kem/firesaber/clean/Makefile
+++ b/crypto_kem/firesaber/clean/Makefile
@@ -1,10 +1,10 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=libfiresaber_clean.a
-HEADERS=api.h cbd.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
+CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
 all: $(LIB)
 
diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c
index da8aa685..8f4364e7 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.c
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.c
@@ -3,296 +3,90 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include "randombytes.h"
 #include <stdint.h>
 #include <string.h>
 
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { // IND-CPA key generation: fills pk and sk
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N] = {0}; // zero-initialised before being handed to MatrixVectorMul as its output
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    uint8_t seed_A[SABER_SEEDBYTES];
+    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
+    int i, j;
 
-#define h1 4 //2^(EQ-EP-1)
+    randombytes(seed_A, SABER_SEEDBYTES); // fresh randomness for the matrix seed
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(seed_s, SABER_NOISE_SEEDBYTES); // separate randomness for the secret vector
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+    PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); // A is derived from the hashed seed_A
+    PQCLEAN_FIRESABER_CLEAN_GenSecret(s, seed_s); // s is derived from seed_s
+    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1); // trailing 1: presumably the transpose flag, as in the removed MatrixVectorMul(..., 1) call -- confirm
 
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]);
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose);
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec);
-
-static void GenMatrix(polyvec *a, const unsigned char *seed) {
-    unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_FIRESABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP); // round: add h1, then drop the low SABER_EQ - SABER_EP bits (no explicit mask first, unlike the removed code)
         }
     }
+
+    PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s); // sk <-- packed s
+    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b); // start of pk <-- packed rounded b
+    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A)); // seed_A is stored right after the packed b
 }
 
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { // IND-CPA encryption of m under pk using seed_sp; writes ciphertext
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t sp[SABER_L][SABER_N];
+    uint16_t bp[SABER_L][SABER_N] = {0}; // zero-initialised before being handed to MatrixVectorMul as its output
+    uint16_t vp[SABER_N] = {0}; // zero-initialised before being handed to InnerProd as its output
+    uint16_t mp[SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    int i, j;
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // the matrix seed sits after the packed vector inside pk
 
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk) {
-    polyvec a[SABER_K];
+    PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A);
+    PQCLEAN_FIRESABER_CLEAN_GenSecret(sp, seed_sp);
+    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0); // trailing 0: presumably "not transposed", mirroring the removed MatrixVectorMul(..., 0) call -- confirm
 
-    uint16_t skpv[SABER_K][SABER_N];
-
-    unsigned char seed[SABER_SEEDBYTES];
-    unsigned char noiseseed[SABER_COINBYTES];
-    int32_t i, j;
-    uint16_t mod_q = SABER_Q - 1;
-
-
-    uint16_t res[SABER_K][SABER_N];
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    // for not revealing system RNG state
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES);
-    randombytes(noiseseed, SABER_COINBYTES);
-
-    GenMatrix(a, seed);   //sample matrix A
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_FIRESABER_CLEAN_GenSecret(skpv, noiseseed);
-
-    // do the matrix vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
+    for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1);
-
-    // now rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            // shift right 3 bits
-            res[i][j] = (res[i][j] + h1) & (mod_q);
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP));
+            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP); // round: add h1, then drop the low SABER_EQ - SABER_EP bits
         }
     }
 
-    // unload and pack sk=3 x (256 coefficients of 14 bits)
-    PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q);
+    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp); // first ciphertext part <-- packed bp
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, pk); // b <-- vector unpacked from the start of pk
+    PQCLEAN_FIRESABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp); // vp <-- InnerProd of b and sp
 
-    // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits)
-    // load the public-key coefficients
-    PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(pk, res, SABER_P);
+    PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(mp, m); // mp <-- message m unpacked by BS2POLmsg
 
-
-    // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    for (j = 0; j < SABER_N; j++) {
+        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET); // NOTE(review): operands promote to int, so the shifted value can be negative (implementation-defined >>); the removed code masked with mod_p first -- confirm POLT2BS only uses the low bits
     }
 
+    PQCLEAN_FIRESABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp); // second ciphertext part, stored after the packed bp
 }
 
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) { // IND-CPA decryption: recovers the message bytes into m
 
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) {
-    uint32_t i, j, k;
-    polyvec a[SABER_K];
-    unsigned char seed[SABER_SEEDBYTES];
-    // public key of received by the client
-    uint16_t pkcl[SABER_K][SABER_N];
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-    uint16_t res[SABER_K][SABER_N];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t mod_q = SABER_Q - 1;
-    uint16_t vprime[SABER_N];
-    unsigned char msk_c[SABER_SCALEBYTES_KEM];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    uint16_t v[SABER_N] = {0}; // zero-initialised before being handed to InnerProd as its output
+    uint16_t cm[SABER_N];
+    int i;
 
-    // extract the seedbytes from Public Key.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk); // s <-- secret vector unpacked from sk
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext); // b <-- vector unpacked from the first ciphertext part
+    PQCLEAN_FIRESABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s); // v <-- InnerProd of b and s
+    PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES); // cm <-- polynomial unpacked from the second ciphertext part
 
-    GenMatrix(a, seed);
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_FIRESABER_CLEAN_GenSecret(skpv1, noiseseed);
-
-    // matrix-vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0);
-
-    // now rounding
-    //shift right 3 bits
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = ( res[i][j] + h1 ) & mod_q;
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) );
-        }
-    }
-
-    PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P);
-
-    // ************client matrix-vector multiplication ends************
-
-    // now calculate the v'
-    // unpack the public_key
-    // pkcl is the b in the protocol
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P);
     for (i = 0; i < SABER_N; i++) {
-        vprime[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            skpv1[i][j] = skpv1[i][j] & (mod_p);
-        }
+        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1); // NOTE(review): operands promote to int, so the shifted value can be negative (implementation-defined >>); the removed code masked with mod_p first -- confirm POLmsg2BS only uses the low bit
     }
 
-    // vector-vector scalar multiplication with mod p
-    InnerProd(pkcl, skpv1, mod_p, vprime);
-
-    // addition of h1 to vprime
-    for (i = 0; i < SABER_N; i++) {
-        vprime[i] = vprime[i] + h1;
-    }
-
-    // unpack message_received;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((message_received[j] >> i) & 0x01);
-        }
-    }
-
-    // message encoding
-    for (i = 0; i < SABER_N; i++) {
-        message[i] = (message[i] << (SABER_EP - 1));
-    }
-
-    for (k = 0; k < SABER_N; k++) {
-        vprime[k] = ( (vprime[k] - message[k]) & (mod_p) ) >> (SABER_EP - SABER_ET);
-    }
-
-
-    PQCLEAN_FIRESABER_CLEAN_pack_6bit(msk_c, vprime);
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j];
-    }
-}
-
-
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) {
-    uint32_t i, j;
-    // secret key of the server
-    uint16_t sksv[SABER_K][SABER_N];
-    uint16_t pksv[SABER_K][SABER_N];
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t v[SABER_N];
-    uint16_t op[SABER_N];
-
-    // sksv is the secret-key
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q);
-    // pksv is the ciphertext
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P);
-
-    // vector-vector scalar multiplication with mod p
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            sksv[i][j] = sksv[i][j] & (mod_p);
-        }
-    }
-    InnerProd(pksv, sksv, mod_p, v);
-
-    //Extraction
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
-
-    PQCLEAN_FIRESABER_CLEAN_un_pack6bit(scale_ar, op);
-
-    //addition of h1
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1);
-    }
-
-    // pack decrypted message
-    POL2MSG(v, message_dec);
-}
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) {
-    uint16_t acc[SABER_N];
-    int32_t i, j, k;
-
-    if (transpose == 1) {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_FIRESABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N);
-
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    //reduction mod p
-                    res[i][k] = (res[i][k] & mod);
-                    //clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    } else {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_FIRESABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N);
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    // reduction
-                    res[i][k] = res[i][k] & mod;
-                    // clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) {
-    int32_t i, j;
-
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
-
-
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) {
-    uint32_t j, k;
-    uint16_t acc[SABER_N];
-
-    // vector-vector scalar multiplication with mod p
-    for (j = 0; j < SABER_K; j++) {
-        PQCLEAN_FIRESABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N);
-
-        for (k = 0; k < SABER_N; k++) {
-            res[k] = res[k] + acc[k];
-            // reduction
-            res[k] = res[k] & mod;
-            // clear the accumulator
-            acc[k] = 0;
-        }
-    }
+    PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(m, v); // m <-- v packed back into message bytes
 }
diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.h b/crypto_kem/firesaber/clean/SABER_indcpa.h
index 6007352d..28a5feee 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.h
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.h
@@ -1,9 +1,13 @@
 #ifndef INDCPA_H
 #define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk);
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext);
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec);
 
 #endif
-
diff --git a/crypto_kem/firesaber/clean/SABER_params.h b/crypto_kem/firesaber/clean/SABER_params.h
index b0d517f8..9121a12b 100644
--- a/crypto_kem/firesaber/clean/SABER_params.h
+++ b/crypto_kem/firesaber/clean/SABER_params.h
@@ -1,49 +1,39 @@
 #ifndef PARAMS_H
 #define PARAMS_H
 
-#include "api.h"
 
-#define SABER_K 4
+/* Change this for different security strengths */
+
+/* Don't change anything below this line */
+#define SABER_L 4
 #define SABER_MU 6
 #define SABER_ET 6
 
 #define SABER_EQ 13
 #define SABER_EP 10
-
 #define SABER_N 256
-#define SABER_Q 8192
-#define SABER_P 1024
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_SEEDBYTES 32
+#define SABER_NOISE_SEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_HASHBYTES       32
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8)
 
-#define SABER_POLYBYTES       416 //13*256/8 
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8)
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)
 
-#define SABER_POLYVECBYTES    (SABER_K * SABER_POLYBYTES)
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8)
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)
 
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
-
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
-
-#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8)
-
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
-
-
-
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM)
 
 #endif
-
diff --git a/crypto_kem/firesaber/clean/api.h b/crypto_kem/firesaber/clean/api.h
index 56d17038..14718674 100644
--- a/crypto_kem/firesaber/clean/api.h
+++ b/crypto_kem/firesaber/clean/api.h
@@ -1,14 +1,18 @@
 #ifndef PQCLEAN_FIRESABER_CLEAN_API_H
 #define PQCLEAN_FIRESABER_CLEAN_API_H
 
+
 #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_ALGNAME "FireSaber"
-#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_SECRETKEYBYTES 3040
-#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_PUBLICKEYBYTES (4*320+32)
 #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_BYTES 32
 #define PQCLEAN_FIRESABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 1472
+#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_PUBLICKEYBYTES 1312
+#define PQCLEAN_FIRESABER_CLEAN_CRYPTO_SECRETKEYBYTES 3040
 
 int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
-int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+
+int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
 
 #endif /* api_h */
diff --git a/crypto_kem/firesaber/clean/cbd.c b/crypto_kem/firesaber/clean/cbd.c
index cca885a1..8032eb5c 100644
--- a/crypto_kem/firesaber/clean/cbd.c
+++ b/crypto_kem/firesaber/clean/cbd.c
@@ -1,3 +1,7 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
-#include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
-#include <stdint.h>
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,33 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { // fills s with SABER_N coefficients; every 3 bytes of buf yield 4 of them
     uint32_t t, d, a[4], b[4];
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
         t = (uint32_t) load_littleendian(buf + 3 * i, 3);
         d = 0;
         for (j = 0; j < 3; j++) {
             d += (t >> j) & 0x249249;
         }
 
-        a[0] =  d & 0x7;
-        b[0] = (d >>  3) & 0x7;
-        a[1] = (d >>  6) & 0x7;
-        b[1] = (d >>  9) & 0x7;
+        a[0] = d & 0x7; // each 3-bit field of d holds the number of set bits in one 3-bit slice of t
+        b[0] = (d >> 3) & 0x7;
+        a[1] = (d >> 6) & 0x7;
+        b[1] = (d >> 9) & 0x7;
         a[2] = (d >> 12) & 0x7;
         b[2] = (d >> 15) & 0x7;
         a[3] = (d >> 18) & 0x7;
         b[3] = (d >> 21);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
-
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]); // difference of two bit counts; a negative result wraps modulo 2^16 and is no longer masked
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/firesaber/clean/cbd.h b/crypto_kem/firesaber/clean/cbd.h
index b10e5202..0fa18b02 100644
--- a/crypto_kem/firesaber/clean/cbd.h
+++ b/crypto_kem/firesaber/clean/cbd.h
@@ -1,6 +1,5 @@
 #ifndef CBD_H
 #define CBD_H
-
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
+
 
 #endif
diff --git a/crypto_kem/firesaber/clean/kem.c b/crypto_kem/firesaber/clean/kem.c
index c66cfed9..e94219a6 100644
--- a/crypto_kem/firesaber/clean/kem.c
+++ b/crypto_kem/firesaber/clean/kem.c
@@ -1,5 +1,6 @@
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
+#include "api.h"
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
@@ -7,90 +8,71 @@
 #include <stdio.h>
 #include <string.h>
 
-int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
+
+int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) { // sk layout: IND-CPA sk | pk | hash(pk) | random fallback value; always returns 0
     int i;
 
-    // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
-    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk);
-
-    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk
+    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
-        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];
+        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
     }
 
-    // Then hash(pk) is appended.
-    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES);
+    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
 
-    // Remaining part of sk contains a pseudo-random number.
-    // This is output when check in crypto_kem_dec() fails.
-    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES );
+    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // The last SABER_KEYBYTES bytes of sk hold a random value.
+    // When the check in PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec() fails, this value replaces the pre-key before the final hash.
     return (0);
 }
 
-int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) {
-    // Will contain key, coins
-    unsigned char kr[64];
-    unsigned char buf[64];
+int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) { // writes the ciphertext to c and the shared key to k; always returns 0
+
+    uint8_t kr[64]; // Will contain key, coins
+    uint8_t buf[64]; // message in buf[0:31], hash of pk in buf[32:63]
 
     randombytes(buf, 32);
 
-    // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output
-    sha3_256(buf, buf, 32);
+    sha3_256(buf, buf, 32); // buf[0:31] <-- random message (will be used as the key for client). Note: hashing means raw system RNG output is not released
 
-    // BUF[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
-    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES);
-
-    // kr[0:63] <-- Hash(buf[0:63]);
-    sha3_512(kr, buf, 64);
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // buf[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
 
+    sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    // buf[0:31] contains message; kr[32:63] contains randomness r;
-    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk,  ct);
+    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite the coins in kr[32:63] with h(c)
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
 
-
-int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
+int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) { // writes the shared key for ciphertext c to k; always returns 0, even when the re-encryption check fails
     int i;
-    unsigned char fail;
-    unsigned char cmp[SABER_BYTES_CCA_DEC];
-    unsigned char buf[64];
-
-    // Will contain key, coins
-    unsigned char kr[64];
-    const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
-
-    // buf[0:31] <-- message
-    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(sk, ct, buf);
+    uint8_t fail;
+    uint8_t cmp[SABER_BYTES_CCA_DEC]; // ciphertext recomputed from the decrypted message
+    uint8_t buf[64];
+    uint8_t kr[64]; // Will contain key, coins
+    const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES; // the public key is stored inside sk after the IND-CPA secret key
 
+    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message
 
     // Multitarget countermeasure for coins + contributory KEM
-    // Save hash by storing h(pk) in sk
-    for (i = 0; i < 32; i++) {
+    for (i = 0; i < 32; i++) { // h(pk) is read back from sk instead of being recomputed
         buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
     }
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp);
+    PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk); // re-encrypt the recovered message with the coins in kr[32:63]
 
+    fail = PQCLEAN_FIRESABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC); // presumably 0 when c equals cmp and 1 otherwise, as documented for the AVX2 verify -- confirm
 
-    fail = PQCLEAN_FIRESABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC);
-
-    // overwrite coins in kr with h(c)
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
 
     PQCLEAN_FIRESABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c
index 9e68ffc1..0add1409 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.c
+++ b/crypto_kem/firesaber/clean/pack_unpack.c
@@ -1,254 +1,136 @@
+#include "api.h"
 #include "pack_unpack.h"
+#include <string.h>
 
-void PQCLEAN_FIRESABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) |
-                                 ((data[offset_data + 1] & 0x7) << 3) |
-                                 ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  |
-                                 ((data[offset_data + 3] & 0x7) << 1) |
-                                 ((data[offset_data + 4] & 0x7) << 4) |
-                                 (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) |
-                                 ((data[offset_data + 6] & 0x7) << 2) |
-                                 ((data[offset_data + 7] & 0x7) << 5);
-    }
-}
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) |
-                                (((bytes[offset_byte + 1]) & 0x01) << 2);
-        data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) |
-                                (((bytes[offset_byte + 2]) & 0x03) << 1);
-        data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07);
-        data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07);
-    }
-}
-
-void PQCLEAN_FIRESABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) |
-                   ((data[offset_data + 1] & 0x0f) << 4);
-    }
-}
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) {
-    uint32_t j;
-    uint32_t offset_data;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        ar[offset_data] = bytes[j] & 0x0f;
-        ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
-    }
-}
-
-void PQCLEAN_FIRESABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) {
+    size_t j, offset_byte, offset_data; /* each iteration packs the low 6 bits of four coefficients into three bytes */
     for (j = 0; j < SABER_N / 4; j++) {
         offset_byte = 3 * j;
         offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) |
-                                 ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) |
-                                 ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) |
-                                 ((data[offset_data + 3] & 0x3f) << 2);
+        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6); /* c0 bits 0-5, c1 bits 0-1 */
+        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4); /* c1 bits 2-5, c2 bits 0-3 */
+        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2); /* c2 bits 4-5, c3 bits 0-5 */
     }
 }
 
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    size_t j, offset_byte, offset_data; /* each iteration unpacks three bytes into four 6-bit coefficients */
     for (j = 0; j < SABER_N / 4; j++) {
         offset_byte = 3 * j;
         offset_data = 4 * j;
         data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |
-                                ((bytes[offset_byte + 1] & 0x0f) << 2);
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) |
-                                ((bytes[offset_byte + 2] & 0x03) << 4);
+        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2); /* top 2 bits of byte 0, low 4 bits of byte 1 */
+        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4); /* top 4 bits of byte 1, low 2 bits of byte 2 */
         data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
     }
 }
 
-
-static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) |
-                                     ((data[i][offset_data + 1] & 0x3f) << 2);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) |
-                                     ((data[i][offset_data + 2] & 0x0f) << 4);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) |
-                                     ((data[i][offset_data + 3] & 0x03) << 6);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x03) << 8);
-            data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) |
-                                       ((bytes[offset_byte + 2] & 0x0f) << 6);
-            data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) |
-                                       ((bytes[offset_byte + 3] & 0x3f) << 4);
-            data[i][offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) |
-                                       ((bytes[offset_byte + 4] & 0xff) << 2);
-        }
-    }
-}
-
-
-
-static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) |
-                                     ((data[i][offset_data + 1] & 0x07) << 5);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) |
-                                     ((data[i][offset_data + 2] & 0x3f) << 2);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) |
-                                     ((data[i][offset_data + 3] & 0x01) << 7);
-            bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff);
-            bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) |
-                                     ((data[i][offset_data + 4] & 0x0f) << 4);
-            bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff);
-            bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) |
-                                     ((data[i][offset_data + 5] & 0x7f) << 1);
-            bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) |
-                                     ((data[i][offset_data + 6] & 0x03) << 6);
-            bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff);
-            bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) |
-                                      ((data[i][offset_data + 7] & 0x1f) << 3);
-            bytes[offset_byte + 12] = ((data[i][offset_data + 7] >> 5) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                       ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                       ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                       ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                       ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                       ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                       ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                       ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                       ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                       ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                       ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                       ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-}
-
-//only BS2POLq no BS2POLp
-void PQCLEAN_FIRESABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) {
+    size_t j, offset_byte, offset_data; /* each iteration packs eight 13-bit coefficients into 13 bytes, low-order bits first */
     for (j = 0; j < SABER_N / 8; j++) {
         offset_byte = 13 * j;
         offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                ((bytes[offset_byte + 12] & 0xff) << 5);
+        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff)); /* low 8 bits of coefficient 0 */
+        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5); /* remaining 5 bits of coefficient 0, then 3 bits of coefficient 1 */
+        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
+        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
+        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
+        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
+        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
+        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
+        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
+        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
+        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
+        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
+        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff); /* top 8 bits of coefficient 7 */
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        POLVECq2BS(bytes, data);
+static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) {
+    /* Rebuild eight 13-bit coefficients from each run of 13 bytes, low-order bits first. */
+    for (size_t g = 0; g < SABER_N / 8; g++) {
+        const uint8_t *in = bytes + 13 * g;
+        uint16_t *out = data + 8 * g;
+        out[0] = (in[0] & 0xff) | ((in[1] & 0x1f) << 8);
+        out[1] = ((in[1] >> 5) & 0x07) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+        out[3] = ((in[4] >> 7) & 0x01) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = ((in[6] >> 4) & 0x0f) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+        out[6] = ((in[9] >> 6) & 0x03) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = ((in[11] >> 3) & 0x1f) | ((in[12] & 0xff) << 5);
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        BS2POLVECp(bytes, data);
-    } else if (modulus == 8192) {
-        BS2POLVECq(bytes, data);
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) {
+    /* Pack each group of four 10-bit coefficients into five bytes, low-order bits first. */
+    for (size_t g = 0; g < SABER_N / 4; g++) {
+        const uint16_t *in = data + 4 * g;
+        uint8_t *out = bytes + 5 * g;
+        out[0] = (in[0] & 0xff);
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
+        out[4] = ((in[3] >> 2) & 0xff);
+    }
+}
+
+static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
+    /* Rebuild four 10-bit coefficients from each run of five bytes, low-order bits first. */
+    for (size_t g = 0; g < SABER_N / 4; g++) {
+        const uint8_t *in = bytes + 5 * g;
+        uint16_t *out = data + 4 * g;
+        out[0] = (in[0] & 0xff) | ((in[1] & 0x03) << 8);
+        out[1] = ((in[1] >> 2) & 0x3f) | ((in[2] & 0x0f) << 6);
+        out[2] = ((in[2] >> 4) & 0x0f) | ((in[3] & 0x3f) << 4);
+        out[3] = ((in[3] >> 6) & 0x03) | ((in[4] & 0xff) << 2);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) {
+    uint8_t *out = bytes; /* write cursor: advances by SABER_POLYBYTES after each packed polynomial */
+    for (size_t row = 0; row < SABER_L; row++, out += SABER_POLYBYTES) {
+        POLq2BS(out, data[row]);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) {
+    const uint8_t *in = bytes; /* read cursor: advances by SABER_POLYBYTES after each unpacked polynomial */
+    for (size_t row = 0; row < SABER_L; row++, in += SABER_POLYBYTES) {
+        BS2POLq(data[row], in);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) {
+    const size_t stride = SABER_EP * SABER_N / 8; /* byte distance between consecutive packed polynomials */
+    for (size_t row = 0; row < SABER_L; row++) {
+        POLp2BS(bytes + row * stride, data[row]);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+    const size_t stride = SABER_EP * SABER_N / 8; /* byte distance between consecutive packed polynomials */
+    for (size_t row = 0; row < SABER_L; row++) {
+        BS2POLp(data[row], bytes + row * stride);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) {
+    for (size_t byte_idx = 0; byte_idx < SABER_KEYBYTES; byte_idx++) {
+        const uint8_t octet = bytes[byte_idx]; /* each bit of this byte becomes one 0/1 coefficient */
+        for (size_t bit = 0; bit < 8; bit++) {
+            data[8 * byte_idx + bit] = ((octet >> bit) & 0x01); /* least significant bit first */
+        }
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) {
+    memset(bytes, 0, SABER_KEYBYTES); /* start from all-zero so the bits can be OR-ed in */
+    for (size_t byte_idx = 0; byte_idx < SABER_KEYBYTES; byte_idx++) {
+        const uint16_t *coeff = data + 8 * byte_idx; /* the eight coefficients feeding this byte */
+        for (size_t bit = 0; bit < 8; bit++) {
+            /* only bit 0 of each coefficient is kept; coefficient `bit` lands in bit position `bit` */
+            bytes[byte_idx] = bytes[byte_idx] | ((coeff[bit] & 0x01) << bit);
+        }
     }
 }
diff --git a/crypto_kem/firesaber/clean/pack_unpack.h b/crypto_kem/firesaber/clean/pack_unpack.h
index 6509f107..0a8ee253 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.h
+++ b/crypto_kem/firesaber/clean/pack_unpack.h
@@ -1,28 +1,27 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
-
 #include "SABER_params.h"
 #include <stdint.h>
 #include <stdio.h>
 
+void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]);
 
-void PQCLEAN_FIRESABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data);
-
-void PQCLEAN_FIRESABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar);
-
-void PQCLEAN_FIRESABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_FIRESABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data);
+void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]);
 
-void PQCLEAN_FIRESABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]);
+
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+
+
+void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
 
 #endif
diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c
index 6fef45d5..c65175fe 100644
--- a/crypto_kem/firesaber/clean/poly.c
+++ b/crypto_kem/firesaber/clean/poly.c
@@ -1,21 +1,49 @@
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-#include "SABER_params.h"
+#include "api.h"
 #include "cbd.h"
 #include "fips202.h"
+#include "pack_unpack.h"
 #include "poly.h"
+#include "poly_mul.h"
+#include <stdio.h>
 
-void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) {
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (size_t i = 0; i < SABER_K; i++) {
-        PQCLEAN_FIRESABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
+void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
+    /* res[row] += sum over col of M[row][col] * s[col], with M = A, or A transposed when transpose == 1. */
+    /* Results are only accumulated through poly_mul_acc; res is never cleared here. */
+    const int use_transpose = (transpose == 1);
+    for (int row = 0; row < SABER_L; row++) {
+        for (int col = 0; col < SABER_L; col++) {
+            /* pick the matrix entry that multiplies s[col] in output row `row` */
+            const uint16_t *entry = use_transpose ? A[col][row] : A[row][col];
+            PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(entry, s[col], res[row]);
+        }
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
+    /* res += b[0]*s[0] + ... + b[SABER_L-1]*s[SABER_L-1]; res is accumulated into, not reset. */
+    for (int term = 0; term < SABER_L; term++) {
+        PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(b[term], s[term], res);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
+    /* Expand seed with shake128 and unpack one SABER_POLYVECBYTES slice into each A[row]. */
+    uint8_t expanded[SABER_L * SABER_POLYVECBYTES];
+    const uint8_t *row_bytes = expanded;
+
+    shake128(expanded, sizeof(expanded), seed, SABER_SEEDBYTES);
+    for (int row = 0; row < SABER_L; row++, row_bytes += SABER_POLYVECBYTES) {
+        PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(A[row], row_bytes);
+    }
+}
+
+void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) {
+    /* Expand seed with shake128, then pass each s[idx] its own SABER_POLYCOINBYTES slice for cbd. */
+    uint8_t coins[SABER_L * SABER_POLYCOINBYTES];
+    const uint8_t *slice = coins;
+
+    shake128(coins, sizeof(coins), seed, SABER_NOISE_SEEDBYTES);
+    for (size_t idx = 0; idx < SABER_L; idx++, slice += SABER_POLYCOINBYTES) {
+        PQCLEAN_FIRESABER_CLEAN_cbd(s[idx], slice);
     }
 }
diff --git a/crypto_kem/firesaber/clean/poly.h b/crypto_kem/firesaber/clean/poly.h
index 4f69a068..044e4eec 100644
--- a/crypto_kem/firesaber/clean/poly.h
+++ b/crypto_kem/firesaber/clean/poly.h
@@ -1,26 +1,15 @@
 #ifndef POLY_H
 #define POLY_H
-
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-
-
 #include "SABER_params.h"
 #include <stdint.h>
 
-typedef struct {
-    uint16_t coeffs[SABER_N];
-} poly;
+void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose);
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]);
+
+void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
 
-void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed);
 
 #endif
diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c
index 20f1d4ad..27c92f29 100644
--- a/crypto_kem/firesaber/clean/poly_mul.c
+++ b/crypto_kem/firesaber/clean/poly_mul.c
@@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) {
-    uint32_t i;
-    // normal multiplication
-    uint16_t c[512];
-
-    for (i = 0; i < 512; i++) {
-        c[i] = 0;
-    }
+/* res += a*b, where the upper SABER_N entries of the product buffer are subtracted from the lower ones */
+void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+    int k;
+    uint16_t c[2 * SABER_N] = {0};
 
     toom_cook_4way(a, b, c);
 
-    // reduction
-    for (i = n; i < 2 * n; i++) {
-        res[i - n] = (c[i - n] - c[i]) & (p - 1);
+    /* fold entry k + SABER_N back onto entry k with a sign flip, then accumulate into res */
+    for (k = 0; k < SABER_N; k++) {
+        res[k] += (c[k] - c[k + SABER_N]);
     }
 }
diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h
index 4d960042..e554d60c 100644
--- a/crypto_kem/firesaber/clean/poly_mul.h
+++ b/crypto_kem/firesaber/clean/poly_mul.h
@@ -1,9 +1,9 @@
-#ifndef POLYMUL_H
-#define POLYMUL_H
-
+#ifndef POLY_MUL_H
+#define POLY_MUL_H
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_FIRESABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n);
+void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]);
+
 
 #endif
diff --git a/crypto_kem/firesaber/clean/verify.c b/crypto_kem/firesaber/clean/verify.c
index 3c571e92..97a302a9 100644
--- a/crypto_kem/firesaber/clean/verify.c
+++ b/crypto_kem/firesaber/clean/verify.c
@@ -1,3 +1,5 @@
+#include "verify.h"
+
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -5,26 +7,25 @@ This file has been adapted from the implementation
  by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------*/
-#include "verify.h"
-#include <stdint.h>
+
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_FIRESABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) {
+uint8_t PQCLEAN_FIRESABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) {
     uint64_t r;
     size_t i;
-
     r = 0;
+    /* OR together the XOR of every byte pair; r stays 0 only if all len bytes are equal */
     for (i = 0; i < len; i++) {
         r |= a[i] ^ b[i];
     }
 
     r = (~r + 1); // Two's complement
     r >>= 63;
-    return (unsigned char)r;
+    return (uint8_t) r; // top bit of the negated value: 1 if any byte differed, 0 otherwise
 }
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_FIRESABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
+void PQCLEAN_FIRESABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
     size_t i;
 
     b = -b;
diff --git a/crypto_kem/firesaber/clean/verify.h b/crypto_kem/firesaber/clean/verify.h
index 1b69b071..1d5e4cb9 100644
--- a/crypto_kem/firesaber/clean/verify.h
+++ b/crypto_kem/firesaber/clean/verify.h
@@ -1,6 +1,5 @@
 #ifndef VERIFY_H
 #define VERIFY_H
-
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 #include <stdint.h>
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_FIRESABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len);
+uint8_t PQCLEAN_FIRESABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_FIRESABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);
+void PQCLEAN_FIRESABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
 
 #endif
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index 1b7912f6..1cc06c9a 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,4 +14,13 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+    - name: avx2
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      supported_platforms:
+          - architecture: x86_64
+            operating_systems:
+                - Linux
+                - Darwin
+            required_flags:
+                - avx2
diff --git a/crypto_kem/lightsaber/avx2/LICENSE b/crypto_kem/lightsaber/avx2/LICENSE
new file mode 100644
index 00000000..d5d21fff
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/LICENSE
@@ -0,0 +1 @@
+Public Domain
diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile
new file mode 100644
index 00000000..0522fe8d
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/Makefile
@@ -0,0 +1,22 @@
+# This Makefile can be used with GNU Make or BSD Make
+
+LIB=liblightsaber_avx2.a
+HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
+OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+
+CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
+
+all: $(LIB)
+
+%.o: %.s $(HEADERS)
+	$(AS) -o $@ $<
+
+%.o: %.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB): $(OBJECTS)
+	$(AR) -r $@ $(OBJECTS)
+
+clean:
+	$(RM) $(OBJECTS)
+	$(RM) $(LIB)
diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
new file mode 100644
index 00000000..3270a8c9
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
@@ -0,0 +1,416 @@
+#include "./polymul/toom-cook_4way.c"
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "randombytes.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+//#include "randombytes.h"
+//#include "./polymul/toom_cook_4/toom-cook_4way.c"
+
+#define h1 (1 << (SABER_EQ - SABER_EP - 1)) // 2^(EQ-EP-1): rounding constant added before the shift by EQ-EP bits
+// h2 = 2^(EP-2) - 2^(EP-ET-1) + 2^(EQ-EP-1): constant added in decryption before the message bit is extracted
+#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+
+
+static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
+    // Folds 8 consecutive coefficients into one byte: coefficient 8*byte+bit is shifted left by `bit` and OR-ed in.
+    for (int32_t byte = 0; byte < SABER_KEYBYTES; byte++) {
+        uint8_t packed = 0;
+        for (int32_t bit = 0; bit < 8; bit++) {
+            packed = (uint8_t) (packed | (message_dec_unpacked[8 * byte + bit] << bit));
+        }
+        message_dec[byte] = packed;
+    }
+}
+
+/*-----------------------------------------------------------------------------------
+    GenMatrix: expands `seed` with SHAKE-128 into a[SABER_K][SABER_K], a matrix of
+    SABER_N-coefficient polynomials; every coefficient is masked with SABER_Q - 1.
+-------------------------------------------------------------------------------------*/
+static void GenMatrix(polyvec *a, const uint8_t *seed) {
+    enum { POLY_BYTES = 13 * SABER_N / 8 }; // one polynomial packed at 13 bits per coefficient
+    const uint16_t q_mask = (SABER_Q - 1);
+    uint8_t xof_out[SABER_K * SABER_K * POLY_BYTES];
+    uint16_t coeffs[SABER_N];
+    int row, col, n;
+
+    shake128(xof_out, sizeof(xof_out), seed, SABER_SEEDBYTES);
+
+    for (row = 0; row < SABER_K; row++) {
+        for (col = 0; col < SABER_K; col++) {
+            // Unpack polynomial (row, col) from its own slice of the XOF output.
+            PQCLEAN_LIGHTSABER_AVX2_BS2POLq(coeffs, xof_out + (row * SABER_K + col) * POLY_BYTES);
+            for (n = 0; n < SABER_N; n++) {
+                a[row].vec[col].coeffs[n] = coeffs[n] & q_mask;
+            }
+        }
+    }
+}
+
+static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
+    // Each of the SABER_K polynomials consumes SABER_MU bits of SHAKE-128 output per coefficient.
+    enum { NOISE_BYTES_PER_POLY = SABER_MU * SABER_N / 8 };
+    uint8_t noise[SABER_K * NOISE_BYTES_PER_POLY];
+    uint32_t idx;
+
+    shake128(noise, sizeof(noise), seed, SABER_NOISESEEDBYTES);
+
+    for (idx = 0; idx < SABER_K; idx++) {
+        PQCLEAN_LIGHTSABER_AVX2_cbd(r[idx], noise + idx * NOISE_BYTES_PER_POLY);
+    }
+}
+
+//********************************matrix-vector mul routines*****************************************************
+static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+    // res_avx[row] is produced from matrix row `row` (matrix column `row` when isTranspose != 0)
+    // and the vector entries in b_bucket, which the callers fill with TC_eval beforehand.
+    __m256i c_bucket[2 * SCM_SIZE * 4]; // scratch buffer shared by all multiplications of one row
+    int64_t row, col;
+
+    for (row = 0; row < NUM_POLY; row++) {
+        for (col = 0; col < NUM_POLY; col++) {
+            // A[row][col] in normal order, A[col][row] in transposed order.
+            __m256i *a_entry = (isTranspose == 0) ? a1_avx_combined[row][col] : a1_avx_combined[col][row];
+
+            // NOTE(review): the last argument is the term index; presumably it tells the multiplier whether to start or continue accumulating in c_bucket - confirm.
+            toom_cook_4way_avx_n1(a_entry, b_bucket[col], c_bucket, col);
+        }
+
+        // TC_interpol consumes c_bucket and writes the result polynomial of this row.
+        TC_interpol(c_bucket, res_avx[row]);
+    }
+}
+
+static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+    // Multiplies a_avx[t] with b_bucket[t] for every t through one scratch buffer; TC_interpol then writes res_avx.
+    __m256i c_bucket[2 * SCM_SIZE * 4];
+    int64_t term = 0;
+
+    while (term < NUM_POLY) {
+        toom_cook_4way_avx_n1(a_avx[term], b_bucket[term], c_bucket, term);
+        term++;
+    }
+    TC_interpol(c_bucket, res_avx);
+}
+
+//********************************matrix-vector mul routines*****************************************************
+
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // IND-CPA key generation: pk = pack_p(round(A^T * s)) || seed, sk = pack_q(s).
+    polyvec a[SABER_K];
+    // Secret vector s; reused further down as scratch when the rounded product is packed into pk.
+    uint16_t skpv1[SABER_K][SABER_N];
+
+
+
+    uint8_t seed[SABER_SEEDBYTES]; // expanded into the matrix A and appended to pk
+    uint8_t noiseseed[SABER_COINBYTES]; // expanded into the secret vector s
+    int32_t i, j, k;
+
+
+//--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+
+
+    mod = _mm256_set1_epi16(SABER_Q - 1); // SABER_Q - 1 in every 16-bit lane
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4]; // filled by TC_eval from s before the multiplication
+
+//--------------AVX declaration ends------------------
+
+    randombytes(seed, SABER_SEEDBYTES);
+
+    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(noiseseed, SABER_COINBYTES);
+
+
+    GenMatrix(a, seed); //sample matrix A
+
+    GenSecret(skpv1, noiseseed); // sample the secret vector s
+
+
+// Load sk into avx vectors (16 coefficients of 16 bits per __m256i)
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+
+    }
+
+    // Load a into avx vectors
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+
+
+
+    //------------------------do the matrix vector multiplication and rounding------------
+    // NOTE(review): TC_eval presumably performs the Toom-Cook evaluation of each secret polynomial - confirm in toom-cook_4way.c.
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+
+    // Now truncation: add the rounding constant h1, then drop the low EQ-EP bits
+
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+        }
+    }
+
+    //------------------Pack sk into byte string-------
+    // Must happen before skpv1 is overwritten by the public-key coefficients below.
+    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
+
+    //------------------Pack pk into byte string-------
+
+    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]); // all-ones mask: stores the full 256 bits
+        }
+    }
+    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
+
+
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
+        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    }
+
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    // IND-CPA encryption of the SABER_KEYBYTES-byte message m under pk with randomness noiseseed.
+    // ciphertext = pack_p(b') || pack_3bit(cm), b' = round(A * s'), cm = ((<b, s'> + h1 - (m << (EP-1))) mod p) >> (EP-ET).
+    uint32_t i, j, k;
+    polyvec a[SABER_K];     // public matrix A expanded from the seed stored in pk
+    uint8_t seed[SABER_SEEDBYTES];
+    uint16_t pkcl[SABER_K][SABER_N];    // public-key vector b unpacked from pk
+
+
+    uint16_t skpv1[SABER_K][SABER_N]; // ephemeral secret vector s'
+    uint16_t temp[SABER_K][SABER_N]; // scratch for moving AVX results back into 16-bit arrays
+    uint16_t message[SABER_KEYBYTES * 8]; // one message bit per element
+
+    uint8_t msk_c[SABER_SCALEBYTES_KEM]; // packed cm
+
+    //--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod, mod_p;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i vprime_avx[SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+
+
+    __m256i pkcl_avx[SABER_K][SABER_N / 16];
+
+    __m256i message_avx[SABER_N / 16];
+
+    mod = _mm256_set1_epi16(SABER_Q - 1);
+    mod_p = _mm256_set1_epi16(SABER_P - 1);
+
+
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4]; // filled once by TC_eval from s', used by both multiplications below
+
+    //--------------AVX declaration ends------------------
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
+        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    }
+
+    GenMatrix(a, seed);
+    GenSecret(skpv1, noiseseed);
+
+    // ----------- Load skpv1 into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+    }
+
+    // ----------- Load the matrix a into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+    //-----------------matrix-vector multiplication and rounding
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+
+    // Now truncation: add the rounding constant h1, then drop the low EQ-EP bits
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+
+        }
+    }
+
+
+    //-----this result is b'; it becomes the first part of the ciphertext.
+    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
+        }
+    }
+
+    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
+
+//**************client matrix-vector multiplication ends******************//
+
+    //------now calculate the v'
+
+    //-------unpack the public_key
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
+        }
+    }
+
+    // InnerProduct(b, s'): b_bucket still holds the TC_eval output for s' computed above.
+    // vprime_avx is not cleared first; vector_vector_mul hands it to TC_interpol as the
+    // output buffer.
+    // NOTE(review): this assumes TC_interpol overwrites its output rather than adding to it - confirm.
+
+    // vector-vector scalar multiplication with mod p
+
+    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+
+    // Computation of v'+h1
+    for (i = 0; i < SABER_N / 16; i++) { //adding h1
+        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
+    }
+
+    // unpack m: one bit per 16-bit element, least significant bit of each byte first
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            message[8 * j + i] = ((m[j] >> i) & 0x01);
+        }
+    }
+    // message encoding: move every message bit to bit position EP-1
+    for (i = 0; i < SABER_N / 16; i++) {
+        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
+        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
+    }
+
+    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
+    for (k = 0; k < SABER_N / 16; k++) {
+        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
+        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
+        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
+    }
+
+    // Unpack avx into temp[0] (temp is free again: b' has already been packed)
+    for (j = 0; j < SABER_N / 16; j++) {
+        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
+    }
+
+    PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(msk_c, temp[0]);
+
+    // Append the packed cm after the SABER_CIPHERTEXTBYTES bytes of b'.
+    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
+        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    }
+
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    // IND-CPA decryption.
+    //   m:          output, the recovered SABER_KEYBYTES-byte message
+    //   sk:         input, the secret vector s packed with SABER_Q
+    //   ciphertext: input, pack_p(b') followed by SABER_SCALEBYTES_KEM bytes of pack_3bit(cm)
+    // Computes m = ((<b', s> + h2 - (cm << (EP - ET))) mod p) >> (EP - 1), one bit per coefficient.
+    uint32_t i, j;
+    uint16_t sksv[SABER_K][SABER_N]; // secret vector s unpacked from sk
+    uint16_t pksv[SABER_K][SABER_N]; // vector b' unpacked from the first part of the ciphertext
+    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // first holds v = <b', s>, then one decrypted bit per element
+    uint8_t scale_ar[SABER_SCALEBYTES_KEM]; // packed cm copied out of the ciphertext
+    uint16_t op[SABER_N]; // cm unpacked to one 3-bit value per coefficient
+
+
+    //--------------AVX declaration------------------
+
+    __m256i v_avx[SABER_N / 16];
+    __m256i sksv_avx[SABER_K][SABER_N / 16];
+    __m256i pksv_avx[SABER_K][SABER_N / 16];
+    // Filled by TC_eval from the secret vector before the inner product.
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+
+    //--------------AVX declaration ends------------------
+
+    //-------unpack the secret key and the ciphertext
+
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
+            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
+        }
+    }
+
+    // Start from an all-zero v. _mm256_setzero_si256() is used rather than xor-ing
+    // v_avx with itself, because the xor would read v_avx before it has been given a value.
+    for (i = 0; i < SABER_N / 16; i++) {
+        v_avx[i] = _mm256_setzero_si256();
+    }
+
+    // InnerProduct(b', s, mod p)
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sksv_avx[j], b_bucket[j]);
+    }
+
+    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+
+    // Spill v into the 16-bit array; the all-ones mask stores the full 256 bits.
+    for (i = 0; i < SABER_N / 16; i++) {
+        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
+    }
+
+    // cm is stored after the SABER_CIPHERTEXTBYTES bytes of b'.
+    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
+        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
+    }
+
+    PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(op, scale_ar);
+
+    // Add h2, subtract cm scaled back to EP bits, reduce mod p and keep the
+    // top bit (bit EP-1) as the message bit.
+    for (i = 0; i < SABER_N; i++) {
+        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+    }
+
+
+    POL2MSG(m, message_dec_unpacked);
+}
diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.h b/crypto_kem/lightsaber/avx2/SABER_indcpa.h
new file mode 100644
index 00000000..61ee77ba
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.h
@@ -0,0 +1,13 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+// IND-CPA encryption layer called by kem.c. keypair writes the packed public key to pk and the packed secret vector to sk.
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+// Encrypts the message m under pk using the randomness in noiseseed and writes SABER_BYTES_CCA_DEC bytes to ciphertext.
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+// Decrypts ciphertext with sk and writes the recovered message to m.
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/SABER_params.h b/crypto_kem/lightsaber/avx2/SABER_params.h
new file mode 100644
index 00000000..11d34fda
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/SABER_params.h
@@ -0,0 +1,46 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+#include "api.h"
+
+
+// LightSaber parameter set.
+
+#define SABER_K 2 // number of polynomials per vector; the matrix is SABER_K x SABER_K
+#define SABER_MU 10 // bits of SHAKE-128 output consumed per secret coefficient
+#define SABER_ET 3 // bits kept per coefficient in the second ciphertext part
+
+// log2 of the moduli q and p
+#define SABER_EQ 13
+#define SABER_EP 10
+
+#define SABER_N 256 // coefficients per polynomial
+#define SABER_Q 8192 //2^13
+#define SABER_P 1024 //2^10
+
+#define SABER_SEEDBYTES       32
+#define SABER_NOISESEEDBYTES  32
+#define SABER_COINBYTES       32
+#define SABER_KEYBYTES        32
+
+#define SABER_HASHBYTES       32
+
+#define SABER_POLYBYTES              416 //13*256/8 
+
+#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
+
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) // 320 = 10*256/8: one polynomial packed at SABER_EP bits per coefficient
+
+#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
+
+#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+
+#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
+#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
+
+#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+// KEM secret key = IND-CPA secret key || public key || hash of the public key || random fallback value
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
+
+#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* full ciphertext: packed b' followed by the 3-bit packed second part */
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/api.h b/crypto_kem/lightsaber/avx2/api.h
new file mode 100644
index 00000000..d1e2105b
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/api.h
@@ -0,0 +1,18 @@
+#ifndef PQCLEAN_LIGHTSABER_AVX2_API_H
+#define PQCLEAN_LIGHTSABER_AVX2_API_H
+
+// All sizes are in bytes and must stay equal to the values derived in SABER_params.h.
+#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_ALGNAME "LightSaber"
+#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_BYTES 32 // shared secret: one SHA3-256 output
+#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_CIPHERTEXTBYTES 736 // SABER_BYTES_CCA_DEC = 640 + 96
+#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_PUBLICKEYBYTES 672 // SABER_PUBLICKEYBYTES = 640 + 32
+#define PQCLEAN_LIGHTSABER_AVX2_CRYPTO_SECRETKEYBYTES 1568 // SABER_SECRETKEYBYTES = 832 + 672 + 32 + 32
+// Generates a key pair into pk and sk; the implementation in kem.c always returns 0.
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+// Encapsulates to pk: writes the ciphertext to ct and the shared secret to k; always returns 0.
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+// Decapsulates ct with sk and writes the shared secret to k; returns 0 even when the internal re-encryption check fails.
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
+
+#endif /* PQCLEAN_LIGHTSABER_AVX2_API_H */
diff --git a/crypto_kem/lightsaber/avx2/cbd.c b/crypto_kem/lightsaber/avx2/cbd.c
new file mode 100644
index 00000000..a43170e2
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/cbd.c
@@ -0,0 +1,51 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
+/*---------------------------------------------------------------------
+This file has been adapted from the public-domain implementation
+(available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+
+
+static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+    // x[0] is the least significant byte; byte k ends up at bits 8k..8k+7 of the result.
+    uint64_t value = x[0];
+    for (int k = 1; k < bytes; k++) {
+        value |= ((uint64_t) x[k]) << (8 * k);
+    }
+    return value;
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
+    // Every coefficient is the difference of the Hamming weights of two
+    // adjacent 5-bit fields, so 40 input bits (5 bytes) yield 4 coefficients
+    // and SABER_N coefficients consume 5 * SABER_N / 4 bytes of buf.
+    const uint64_t every_fifth_bit = 0x0842108421UL;
+    const uint16_t q_mask = SABER_Q - 1;
+    int group, bit, k;
+
+    for (group = 0; group < SABER_N / 4; group++) {
+        const uint64_t word = load_littleendian(buf + 5 * group, 5);
+        uint64_t weights = 0;
+
+        // After the five shifted additions, each 5-bit field of `weights`
+        // holds the number of set bits in the matching field of `word`.
+        for (bit = 0; bit < 5; bit++) {
+            weights += (word >> bit) & every_fifth_bit;
+        }
+
+        for (k = 0; k < 4; k++) {
+            // Fields 2k and 2k+1 form the pair for coefficient k; the
+            // difference wraps modulo 2^16 and is then masked with SABER_Q - 1.
+            const uint64_t pos = (weights >> (10 * k)) & 0x1f;
+            const uint64_t neg = (weights >> (10 * k + 5)) & 0x1f;
+
+            r[4 * group + k] = (uint16_t) (pos - neg) & q_mask;
+        }
+    }
+}
diff --git a/crypto_kem/lightsaber/avx2/cbd.h b/crypto_kem/lightsaber/avx2/cbd.h
new file mode 100644
index 00000000..01ba76e8
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/cbd.h
@@ -0,0 +1,16 @@
+#ifndef CBD_H
+#define CBD_H
+/*---------------------------------------------------------------------
+This file has been adapted from the public-domain implementation
+(available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+#include "poly.h"
+#include <stdint.h>
+// Samples SABER_N coefficients into r from buf; the implementation in cbd.c reads 5 bytes of buf per 4 coefficients.
+void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/kem.c b/crypto_kem/lightsaber/avx2/kem.c
new file mode 100644
index 00000000..70221f10
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/kem.c
@@ -0,0 +1,79 @@
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "fips202.h"
+#include "randombytes.h"
+#include "verify.h"
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // Layout of sk: IND-CPA secret key | copy of pk | SHA3-256(pk) | SABER_KEYBYTES random bytes.
+    uint8_t *const sk_pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+    uint8_t *const sk_pk_hash = sk + SABER_SECRETKEYBYTES - 64;
+    uint8_t *const sk_random = sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES;
+
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(pk, sk);
+    for (int idx = 0; idx < SABER_INDCPA_PUBLICKEYBYTES; idx++) {
+        sk_pk[idx] = pk[idx];
+    }
+    sha3_256(sk_pk_hash, pk, SABER_INDCPA_PUBLICKEYBYTES);
+    randombytes(sk_random, SABER_KEYBYTES); // used by PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec() when its check fails
+    return 0;
+}
+
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
+    uint8_t buf[64]; // message || SHA3-256(pk)
+    uint8_t kr[64];  // pre-key K^ || coins r, later K^ || SHA3-256(c)
+    uint8_t *const msg = buf;
+    uint8_t *const pk_hash = buf + 32;
+    uint8_t *const coins = kr + 32;
+
+    // The random message is hashed so that raw system RNG output is never used directly.
+    randombytes(msg, 32);
+    sha3_256(msg, msg, 32);
+
+    // Mix the public key into the derivation (multitarget countermeasure, contributory KEM).
+    sha3_256(pk_hash, pk, SABER_INDCPA_PUBLICKEYBYTES);
+    sha3_512(kr, buf, 64);
+
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, msg, coins, pk);
+
+    // Overwrite the coins with SHA3-256(c), then derive k from K^ || SHA3-256(c).
+    sha3_256(coins, c, SABER_BYTES_CCA_DEC);
+    sha3_256(k, kr, 64);
+    return 0;
+}
+
+int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
+    const uint8_t *const pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+    const uint8_t *const pk_hash = sk + SABER_SECRETKEYBYTES - 64;
+    const uint8_t *const sk_random = sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES;
+    uint8_t reencrypted[SABER_BYTES_CCA_DEC];
+    uint8_t buf[64]; // decrypted message || SHA3-256(pk) as stored in sk
+    uint8_t kr[64];  // pre-key K^ || coins r, later the hash input for k
+    uint8_t fail;
+    int idx;
+
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(buf, sk, c);
+    for (idx = 0; idx < 32; idx++) {
+        buf[32 + idx] = pk_hash[idx];
+    }
+    sha3_512(kr, buf, 64);
+
+    // Re-encrypt the recovered message with the recomputed coins and compare with c.
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(reencrypted, buf, kr + 32, pk);
+    fail = PQCLEAN_LIGHTSABER_AVX2_verify(c, reencrypted, SABER_BYTES_CCA_DEC);
+
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // the coins are no longer needed; store SHA3-256(c) in their place
+
+    // NOTE(review): cmov presumably replaces K^ with the random bytes from sk only when fail is set - confirm in verify.c.
+    PQCLEAN_LIGHTSABER_AVX2_cmov(kr, sk_random, SABER_KEYBYTES, fail);
+
+    sha3_256(k, kr, 64);
+
+    return 0;
+}
diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h
new file mode 100644
index 00000000..b80c335d
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/kem.h
@@ -0,0 +1,35 @@
+#ifndef PQCLEAN_LIGHTSABER_AVX2_KEM_H
+#define PQCLEAN_LIGHTSABER_AVX2_KEM_H
+
+#include <stdint.h>
+
+/*
+ * The guard above is deliberately distinct from INDCPA_H, which guards
+ * SABER_indcpa.h: with a shared guard, whichever of the two headers is
+ * included second is silently dropped.
+ */
+
+/*
+ * PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair, _enc and _dec: the prototypes
+ * come from SABER_indcpa.h so that their argument order always matches the
+ * definitions in SABER_indcpa.c (enc takes ciphertext, m, noiseseed, pk;
+ * dec takes m, sk, ciphertext).
+ */
+#include "SABER_indcpa.h"
+
+/*
+ * PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair, _enc and _dec are declared in
+ * api.h; including it here avoids a second, redundant declaration.
+ */
+#include "api.h"
+
+/*
+ * Legacy IND-CPA key-exchange entry points. NOTE(review): no definition of
+ * these is visible in kem.c or SABER_indcpa.c - confirm they are still needed.
+ */
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c
new file mode 100644
index 00000000..e912fd0a
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.c
@@ -0,0 +1,502 @@
+#include "pack_unpack.h"
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
+    // Packs the low 3 bits of each of the SABER_N coefficients, least significant
+    // bits first: every 8 coefficients d[0..7] fill 3 output bytes b[0..2].
+    uint32_t group;
+
+    for (group = 0; group < SABER_N / 8; group++) {
+        const uint16_t *d = data + 8 * group;
+        uint8_t *b = bytes + 3 * group;
+        b[0] = (d[0] & 0x7) | ((d[1] & 0x7) << 3) | ((d[2] & 0x3) << 6);
+        b[1] = ((d[2] >> 2) & 0x01) | ((d[3] & 0x7) << 1) | ((d[4] & 0x7) << 4) | ((d[5] & 0x01) << 7);
+        b[2] = ((d[5] >> 1) & 0x03) | ((d[6] & 0x7) << 2) | ((d[7] & 0x7) << 5);
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
+    // Inverse of SABER_pack_3bit: every 3 input bytes b[0..2] expand to 8
+    // coefficients d[0..7] of 3 bits each, least significant bits first.
+    uint32_t group;
+
+    for (group = 0; group < SABER_N / 8; group++) {
+        const uint8_t *b = bytes + 3 * group;
+        uint16_t *d = data + 8 * group;
+        d[0] = b[0] & 0x07;
+        d[1] = (b[0] >> 3) & 0x07;
+        d[2] = ((b[0] >> 6) & 0x03) | ((b[1] & 0x01) << 2);
+        d[3] = (b[1] >> 1) & 0x07;
+        d[4] = (b[1] >> 4) & 0x07;
+        d[5] = ((b[1] >> 7) & 0x01) | ((b[2] & 0x03) << 1);
+        d[6] = (b[2] >> 2) & 0x07;
+        d[7] = (b[2] >> 5) & 0x07;
+    }
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
+    // Two 4-bit coefficients per byte: the even-indexed one in the low nibble, the odd-indexed one in the high nibble.
+    uint32_t idx;
+
+    for (idx = 0; idx < SABER_N / 2; idx++) {
+        const uint16_t low = data[2 * idx] & 0x0f;
+        const uint16_t high = data[2 * idx + 1] & 0x0f;
+        bytes[idx] = low | (high << 4);
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
+    // Inverse of SABER_pack_4bit: the low nibble of every byte becomes the
+    // even-indexed coefficient, the high nibble the odd-indexed one.
+    uint32_t idx;
+
+    for (idx = 0; idx < SABER_N / 2; idx++) {
+        uint16_t *pair = data + 2 * idx;
+        pair[0] = bytes[idx] & 0x0f;
+        pair[1] = (bytes[idx] >> 4) & 0x0f;
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
+    // Packs the low 6 bits of each of the SABER_N coefficients, least significant
+    // bits first: every 4 coefficients d[0..3] fill 3 output bytes b[0..2].
+    uint32_t group;
+
+    for (group = 0; group < SABER_N / 4; group++) {
+        const uint16_t *d = data + 4 * group;
+        uint8_t *b = bytes + 3 * group;
+        b[0] = (d[0] & 0x3f) | ((d[1] & 0x03) << 6);
+        b[1] = ((d[1] >> 2) & 0x0f) | ((d[2] & 0x0f) << 4);
+        b[2] = ((d[2] >> 4) & 0x03) | ((d[3] & 0x3f) << 2);
+    }
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
+    // Inverse of SABER_pack_6bit: every 3 input bytes b[0..2] expand to 4
+    // coefficients d[0..3] of 6 bits each, least significant bits first.
+    uint32_t group;
+
+    for (group = 0; group < SABER_N / 4; group++) {
+        const uint8_t *b = bytes + 3 * group;
+        uint16_t *d = data + 4 * group;
+        d[0] = b[0] & 0x3f;
+        d[1] = ((b[0] >> 6) & 0x03) | ((b[1] & 0x0f) << 2);
+        d[2] = (b[1] >> 4) | ((b[2] & 0x03) << 4);
+        d[3] = b[2] >> 2;
+    }
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /*
+     * Serialises a vector of SABER_K polynomials with 10 bits per coefficient.
+     *
+     * bytes: output buffer of SABER_K * SABER_N * 10 / 8 bytes.
+     * data:  input coefficients; only the low 10 bits of each are written.
+     *
+     * Layout, per polynomial and per group of 4 coefficients d0..d3 (5 bytes),
+     * written here as value[high bit:low bit]:
+     *   byte 0 = d0[7:0]
+     *   byte 1 = d1[5:0] << 2 | d0[9:8]
+     *   byte 2 = d2[3:0] << 4 | d1[9:6]
+     *   byte 3 = d3[1:0] << 6 | d2[9:4]
+     *   byte 4 = d3[9:2]
+     *
+     * This is exactly the encoding produced by
+     * PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS, so the work is delegated to it
+     * instead of keeping a second copy of the bit manipulation that could
+     * drift out of sync with the first.
+     */
+    PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data);
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Serialises SABER_K polynomials with 10 bits per coefficient, least
+    // significant bits first: every 4 coefficients d[0..3] of a polynomial
+    // fill 5 output bytes b[0..4].
+    uint32_t poly, group;
+
+    for (poly = 0; poly < SABER_K; poly++) {
+        const uint16_t *coeffs = data[poly];
+        uint8_t *poly_out = bytes + poly * (SABER_N * 10) / 8;
+
+        for (group = 0; group < SABER_N / 4; group++) {
+            const uint16_t *d = coeffs + 4 * group;
+            uint8_t *b = poly_out + 5 * group;
+
+            b[0] = d[0] & 0xff;
+            b[1] = ((d[0] >> 8) & 0x03) | ((d[1] & 0x3f) << 2);
+            b[2] = ((d[1] >> 6) & 0x0f) | ((d[2] & 0x0f) << 4);
+            b[3] = ((d[2] >> 4) & 0x3f) | ((d[3] & 0x03) << 6);
+            b[4] = (d[3] >> 2) & 0xff;
+        }
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Serialises SABER_K polynomials with 13 bits per coefficient, least
+    // significant bits first.
+    //   bytes: output buffer of SABER_K * SABER_N * 13 / 8 bytes
+    //   data:  input coefficients; only the low 13 bits of each are written
+    // Every 8 coefficients d[0..7] of a polynomial fill 13 output bytes
+    // b[0..12]; the comments below name the coefficients whose bits are
+    // completed by each run of bytes.
+    uint32_t poly, group;
+
+    for (poly = 0; poly < SABER_K; poly++) {
+        const uint16_t *coeffs = data[poly];
+        uint8_t *poly_out = bytes + poly * (SABER_N * 13) / 8;
+
+        for (group = 0; group < SABER_N / 8; group++) {
+            const uint16_t *d = coeffs + 8 * group;
+            uint8_t *b = poly_out + 13 * group;
+
+            // d[0] and d[1]
+            b[0] = d[0] & 0xff;
+            b[1] = ((d[0] >> 8) & 0x1f) | ((d[1] & 0x07) << 5);
+            b[2] = (d[1] >> 3) & 0xff;
+            b[3] = ((d[1] >> 11) & 0x03) | ((d[2] & 0x3f) << 2);
+
+            // d[2] and d[3]
+            b[4] = ((d[2] >> 6) & 0x7f) | ((d[3] & 0x01) << 7);
+            b[5] = (d[3] >> 1) & 0xff;
+            b[6] = ((d[3] >> 9) & 0x0f) | ((d[4] & 0x0f) << 4);
+
+            // d[4] and d[5]
+            b[7] = (d[4] >> 4) & 0xff;
+            b[8] = ((d[4] >> 12) & 0x01) | ((d[5] & 0x7f) << 1);
+            b[9] = ((d[5] >> 7) & 0x3f) | ((d[6] & 0x03) << 6);
+
+            // d[6] and d[7]
+            b[10] = (d[6] >> 2) & 0xff;
+            b[11] = ((d[6] >> 10) & 0x07) | ((d[7] & 0x1f) << 3);
+            b[12] = (d[7] >> 5) & 0xff;
+        }
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
+    // Unpacks one polynomial stored with 13 bits per coefficient, least significant
+    // bits first: every 13 input bytes b[0..12] expand to 8 coefficients d[0..7].
+    uint32_t group;
+
+    for (group = 0; group < SABER_N / 8; group++) {
+        const uint8_t *b = bytes + 13 * group;
+        uint16_t *d = data + 8 * group;
+        d[0] = b[0] | ((b[1] & 0x1f) << 8);
+        d[1] = ((b[1] >> 5) & 0x07) | (b[2] << 3) | ((b[3] & 0x03) << 11);
+        d[2] = ((b[3] >> 2) & 0x3f) | ((b[4] & 0x7f) << 6);
+        d[3] = ((b[4] >> 7) & 0x01) | (b[5] << 1) | ((b[6] & 0x0f) << 9);
+        d[4] = ((b[6] >> 4) & 0x0f) | (b[7] << 4) | ((b[8] & 0x01) << 12);
+        d[5] = ((b[8] >> 1) & 0x7f) | ((b[9] & 0x3f) << 7);
+        d[6] = ((b[9] >> 6) & 0x03) | (b[10] << 2) | ((b[11] & 0x07) << 10);
+        d[7] = ((b[11] >> 3) & 0x1f) | (b[12] << 5);
+    }
+}
+
+
+
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Deserialize SABER_K polynomials. Each group of 5 input bytes carries four
+     * 10-bit coefficients, packed starting from the least significant bit. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            /* polynomial k begins k * SABER_N * 10 / 8 bytes into the input */
+            const uint8_t *in = bytes + (k * (SABER_N * 10) / 8 + 5 * blk);
+            uint16_t *out = data[k] + 4 * blk;
+
+            out[0] = (in[0] & 0xff) | ((in[1] & 0x03) << 8);
+            out[1] = ((in[1] >> 2) & 0x3f) | ((in[2] & 0x0f) << 6);
+            out[2] = ((in[2] >> 4) & 0x0f) | ((in[3] & 0x3f) << 4);
+            out[3] = ((in[3] >> 6) & 0x03) | ((in[4] & 0xff) << 2);
+        }
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Deserialize SABER_K polynomials. Each group of 13 input bytes carries eight
+     * 13-bit coefficients; polynomial k begins k * SABER_N * 13 / 8 bytes into the input. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint8_t *in = bytes + (k * (SABER_N * 13) / 8 + 13 * blk);
+            uint16_t *out = data[k] + 8 * blk;
+
+            out[0] = (in[0] & 0xff) | ((in[1] & 0x1f) << 8);
+            out[1] = ((in[1] >> 5) & 0x07) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+            out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+            out[3] = ((in[4] >> 7) & 0x01) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+            out[4] = ((in[6] >> 4) & 0x0f) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+            out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+            out[6] = ((in[9] >> 6) & 0x03) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+            out[7] = ((in[11] >> 3) & 0x1f) | ((in[12] & 0xff) << 5);
+        }
+    }
+
+
+}
+
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expand a 10-bit packed byte stream into SABER_K polynomials:
+     * 5 input bytes -> 4 coefficients, least significant bits first. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            /* polynomial k begins k * SABER_N * 10 / 8 bytes into the input */
+            const uint8_t *in = bytes + (k * (SABER_N * 10) / 8 + 5 * blk);
+            uint16_t *out = data[k] + 4 * blk;
+
+            out[0] = (in[0] & 0xff) | ((in[1] & 0x03) << 8);
+            out[1] = ((in[1] >> 2) & 0x3f) | ((in[2] & 0x0f) << 6);
+            out[2] = ((in[2] >> 4) & 0x0f) | ((in[3] & 0x3f) << 4);
+            out[3] = ((in[3] >> 6) & 0x03) | ((in[4] & 0xff) << 2);
+        }
+    }
+
+
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Serialize SABER_K polynomials: the low 13 bits of every eight consecutive
+     * coefficients are packed into 13 output bytes, least significant bits first. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint16_t *in = data[k] + 8 * blk;
+            uint8_t *out = bytes + (k * (SABER_N * 13) / 8 + 13 * blk);
+
+            out[0] = (in[0] & 0xff);
+
+            out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+
+            out[2] = ((in[1] >> 3) & 0xff);
+
+            out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+
+            out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+
+            out[5] = ((in[3] >> 1) & 0xff);
+
+            out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+
+            out[7] = ((in[4] >> 4) & 0xff);
+
+            out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+
+            out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+
+            out[10] = ((in[6] >> 2) & 0xff);
+
+            out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+
+            out[12] = ((in[7] >> 5) & 0xff);
+
+        }
+    }
+
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expand a 13-bit packed byte stream into SABER_K polynomials:
+     * 13 input bytes -> 8 coefficients, least significant bits first. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint8_t *in = bytes + (k * (SABER_N * 13) / 8 + 13 * blk);
+            uint16_t *out = data[k] + 8 * blk;
+
+            out[0] = (in[0] & 0xff) | ((in[1] & 0x1f) << 8);
+            out[1] = ((in[1] >> 5) & 0x07) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+            out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+            out[3] = ((in[4] >> 7) & 0x01) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+            out[4] = ((in[6] >> 4) & 0x0f) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+            out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+            out[6] = ((in[9] >> 6) & 0x03) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+            out[7] = ((in[11] >> 3) & 0x1f) | ((in[12] & 0xff) << 5);
+        }
+    }
+
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
+    /*
+     * Single-polynomial variant of the 13-bit unpacker: reads
+     * 13 * (SABER_N / 8) bytes from `bytes` and writes SABER_N coefficients
+     * to `data`, least significant bits first.
+     */
+    uint32_t blk;
+
+    for (blk = 0; blk < SABER_N / 8; blk++) {
+        /* 13 input bytes expand to 8 coefficients */
+        const uint8_t *in = bytes + 13 * blk;
+        uint16_t *out = data + 8 * blk;
+
+        out[0] = (in[0] & 0xff) | ((in[1] & 0x1f) << 8);
+        out[1] = ((in[1] >> 5) & 0x07) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+        out[3] = ((in[4] >> 7) & 0x01) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = ((in[6] >> 4) & 0x0f) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+        out[6] = ((in[9] >> 6) & 0x03) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = ((in[11] >> 3) & 0x1f) | ((in[12] & 0xff) << 5);
+    }
+
+
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Serialize SABER_K polynomials: the low 11 bits of every eight consecutive
+     * coefficients are packed into 11 output bytes, least significant bits first.
+     */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint16_t *in = data[k] + 8 * blk;
+            uint8_t *out = bytes + (k * (SABER_N * 11) / 8 + 11 * blk);
+
+            out[0] = (in[0] & 0xff);
+
+            out[1] = ((in[0] >> 8) & 0x07) | ((in[1] & 0x1f) << 3);
+
+            out[2] = ((in[1] >> 5) & 0x3f) | ((in[2] & 0x03) << 6);
+
+            out[3] = ((in[2] >> 2) & 0xff);
+
+            out[4] = ((in[2] >> 10) & 0x01) | ((in[3] & 0x7f) << 1);
+
+            out[5] = ((in[3] >> 7) & 0x0f) | ((in[4] & 0x0f) << 4);
+
+            out[6] = ((in[4] >> 4) & 0x7f) | ((in[5] & 0x01) << 7);
+
+            out[7] = ((in[5] >> 1) & 0xff);
+
+            out[8] = ((in[5] >> 9) & 0x03) | ((in[6] & 0x3f) << 2);
+
+            out[9] = ((in[6] >> 6) & 0x1f) | ((in[7] & 0x07) << 5);
+
+            out[10] = ((in[7] >> 3) & 0xff);
+
+        }
+    }
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expand an 11-bit packed byte stream into SABER_K polynomials:
+     * 11 input bytes -> 8 coefficients, least significant bits first.
+     */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            /* polynomial k begins k * SABER_N * 11 / 8 bytes into the input */
+            const uint8_t *in = bytes + (k * (SABER_N * 11) / 8 + 11 * blk);
+            uint16_t *out = data[k] + 8 * blk;
+
+            out[0] = (in[0]) | ((in[1] & 0x07) << 8);
+
+            out[1] = ((in[1] >> 3) & 0x1f) | ((in[2] & 0x3f) << 5);
+
+            out[2] = ((in[2] >> 6) & 0x03) | ((in[3] & 0xff) << 2) | ((in[4] & 0x01) << 10);
+
+            out[3] = ((in[4] >> 1) & 0x7f) | ((in[5] & 0x0f) << 7);
+
+            out[4] = ((in[5] >> 4) & 0x0f) | ((in[6] & 0x7f) << 4);
+
+            out[5] = ((in[6] >> 7) & 0x01) | ((in[7] & 0xff) << 1) | ((in[8] & 0x03) << 9);
+
+            out[6] = ((in[8] >> 2) & 0x3f) | ((in[9] & 0x1f) << 6);
+
+            out[7] = ((in[9] >> 5) & 0x07) | ((in[10] & 0xff) << 3);
+        }
+    }
+
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Serialize SABER_K polynomials: the low 14 bits of every four consecutive
+     * coefficients are packed into 7 output bytes, least significant bits first. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint16_t *in = data[k] + 4 * blk;
+            uint8_t *out = bytes + (k * (SABER_N * 14) / 8 + 7 * blk);
+
+            out[0] = (in[0] & 0xff);
+
+            out[1] = ((in[0] >> 8) & 0x3f) | ((in[1] & 0x03) << 6);
+
+            out[2] = ((in[1] >> 2) & 0xff);
+
+            out[3] = ((in[1] >> 10) & 0x0f) | ((in[2] & 0x0f) << 4);
+
+            out[4] = ((in[2] >> 4) & 0xff);
+
+            out[5] = ((in[2] >> 12) & 0x03) | ((in[3] & 0x3f) << 2);
+
+            out[6] = ((in[3] >> 6) & 0xff);
+        }
+    }
+
+
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expand a 14-bit packed byte stream into SABER_K polynomials:
+     * 7 input bytes -> 4 coefficients, least significant bits first. */
+    uint32_t k, blk;
+
+    for (k = 0; k < SABER_K; k++) {
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint8_t *in = bytes + (k * (SABER_N * 14) / 8 + 7 * blk);
+            uint16_t *out = data[k] + 4 * blk;
+
+            out[0] = (in[0] & 0xff) | ((in[1] & 0x3f) << 8);
+
+            out[1] = ((in[1] >> 6) & 0x03) | ((in[2] & 0xff) << 2) | ((in[3] & 0x0f) << 10);
+
+            out[2] = ((in[3] >> 4) & 0x0f) | ((in[4]) << 4) | ((in[5] & 0x03) << 12);
+
+            out[3] = ((in[5] >> 2) & 0x3f) | ((in[6]) << 6);
+        }
+    }
+
+
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
+    /* Pick the serializer by modulus (8192 -> q, 1024 -> p); any other value leaves bytes untouched. */
+    if (modulus == 8192) {
+        PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(bytes, data);
+    } else if (modulus == 1024) {
+        PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data);
+    }
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
+    /* Pick the deserializer by modulus (8192 -> 13-bit, 1024 -> 10-bit); any other value leaves data untouched. */
+    if (modulus == 8192) {
+        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(data, bytes);
+    } else if (modulus == 1024) {
+        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(data, bytes);
+    }
+
+}
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.h b/crypto_kem/lightsaber/avx2/pack_unpack.h
new file mode 100644
index 00000000..9a5d41f0
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.h
@@ -0,0 +1,56 @@
+#ifndef PACK_UNPACK_H
+#define PACK_UNPACK_H
+#include "SABER_params.h"
+#include <stdint.h>
+#include <stdio.h>
+/* Unpack one polynomial: 13 * (SABER_N / 8) bytes -> SABER_N 13-bit coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+/* Dispatch on modulus: 1024 -> BS2POLVECp, 8192 -> BS2POLVECq, otherwise no-op. */
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
+/* Unpack SABER_K polynomials of 13-bit coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+/* Unpack SABER_K polynomials of 10-bit coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+/* Dispatch on modulus: 1024 -> POLVECp2BS, 8192 -> POLVECq2BS, otherwise no-op. */
+void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+/* NOTE(review): presumably the inverses of BS2POLVECq / BS2POLVECp -- confirm in pack_unpack.c. */
+void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Fixed-width packers; the bit width per coefficient is given by the function name. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+/* Packs the low 11 bits of each coefficient: 8 coefficients -> 11 bytes. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+/* Packs the low 13 bits of each coefficient: 8 coefficients -> 13 bytes. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+/* Packs the low 14 bits of each coefficient: 4 coefficients -> 7 bytes. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Single-polynomial 13-bit unpacker (same bit layout as BS2POLq). */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+
+/* Fixed-width unpackers; the bit width per coefficient is given by the function name. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
+/* 5 bytes -> 4 coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+/* 11 bytes -> 8 coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+/* 13 bytes -> 8 coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+/* 7 bytes -> 4 coefficients. */
+void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/poly.h b/crypto_kem/lightsaber/avx2/poly.h
new file mode 100644
index 00000000..8f2a7574
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/poly.h
@@ -0,0 +1,27 @@
+#ifndef POLY_H
+#define POLY_H
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(Public Domain, available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+#include "SABER_params.h"
+#include <stdint.h>
+/* A polynomial stored as SABER_N 16-bit coefficients. */
+typedef struct {
+    uint16_t coeffs[SABER_N];
+} poly;
+/* A vector of SABER_K polynomials. */
+typedef struct {
+    poly vec[SABER_K];
+} polyvec;
+/* NOTE(review): presumably fills r with noise derived from seed and nonce -- defined elsewhere, confirm. */
+void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+
+/* NOTE(review): takes four nonces but only three output pointers (r0..r2) -- confirm this is intended. */
+void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/avx2/polymul/consts.h b/crypto_kem/lightsaber/avx2/polymul/consts.h
new file mode 100644
index 00000000..40826398
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/polymul/consts.h
@@ -0,0 +1,20 @@
+#include "../SABER_params.h"
+// NOTE(review): no include guard here; the macros below are redefined identically on re-inclusion.
+#define AVX_N (SABER_N >> 4)
+#define small_len_avx (AVX_N >> 2)
+// AVX_N = SABER_N / 16 and small_len_avx = SABER_N / 64.
+#define SCHB_N 16
+// N_SB = SABER_N / 4; N_SB_RES = 2 * N_SB - 1 (looks like a product length -- TODO confirm against the multiplier).
+#define N_SB (SABER_N >> 2)
+#define N_SB_RES (2*N_SB-1)
+// N_SB_16 = SABER_N / 16; N_SB_16_RES = 2 * N_SB_16 - 1.
+#define N_SB_16 (N_SB >> 2)
+#define N_SB_16_RES (2*N_SB_16-1)
+
+#define AVX_N1 16 /*N/16*/ 
+
+#define SCM_SIZE 16
+
+// The dimension of a vector, i.e. a vector has NUM_POLY elements and a matrix has NUM_POLY x NUM_POLY elements.
+#define NUM_POLY SABER_K
+//int NUM_POLY=2; 
diff --git a/crypto_kem/lightsaber/avx2/polymul/matrix.c b/crypto_kem/lightsaber/avx2/polymul/matrix.c
new file mode 100644
index 00000000..5fa35783
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/polymul/matrix.c
@@ -0,0 +1,303 @@
+#include <immintrin.h>
+
+static void transpose_n1(__m256i *M)
+{
+	// In-place transpose of the 16x16 matrix of 16-bit words held in M[0..15] (M[i] is row i).
+	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+	register __m256i temp, temp0, temp1, temp2;
+
+	// Step 1: interleave the 16-bit words of adjacent row pairs (low half of each 128-bit lane).
+	// The unpack intrinsics act on each 128-bit lane separately; the permutes below regroup the lanes.
+		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
+		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
+		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
+		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
+		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
+		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
+
+		// Step 2: interleave 32-bit pairs; the low parts go to temp*, the high parts to r8..r11.
+		temp = _mm256_unpacklo_epi32(r0, r1); 
+		temp0 = _mm256_unpacklo_epi32(r2, r3); 
+		temp1 = _mm256_unpacklo_epi32(r4, r5); 
+		temp2 = _mm256_unpacklo_epi32(r6, r7); 
+
+		r8 = _mm256_unpackhi_epi32(r0, r1); 
+		r9 = _mm256_unpackhi_epi32(r2, r3); 
+		r10 = _mm256_unpackhi_epi32(r4, r5); 
+		r11 = _mm256_unpackhi_epi32(r6, r7);
+		// Step 3: interleave 64-bit groups. r0/r1 carry columns 0 and 8, r2/r3 columns 1 and 9 (input rows 0-7 / 8-15).
+		r0 = _mm256_unpacklo_epi64(temp, temp0); 
+		r2 = _mm256_unpackhi_epi64(temp, temp0); 
+
+		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
+		r3 = _mm256_unpackhi_epi64(temp1, temp2);
+		// Read the high halves of rows 0..9 now: M[0], M[1], M[8] and M[9] are overwritten just below.
+		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
+		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
+		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
+		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
+		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
+		// Immediate 0x20 joins the low 128-bit lanes of both operands, 0x31 joins the high lanes.
+		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
+		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+
+		// Rows 10..15 have not been overwritten yet, so their high halves can still be read here.
+		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
+		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
+		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+
+		// Output rows 2, 3, 10 and 11 come from the high 32-bit interleaves r8..r11.
+		r0 = _mm256_unpacklo_epi64(r8, r9); 
+		r1 = _mm256_unpacklo_epi64(r10, r11); 
+
+		r2 = _mm256_unpackhi_epi64(r8, r9); 
+		r3 = _mm256_unpackhi_epi64(r10, r11); 
+
+
+
+		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+
+	// High halves of the lanes, 32-bit interleave, low parts: these feed output rows 4, 5, 12 and 13.
+
+		r0 = _mm256_unpacklo_epi32(temp, temp0); 
+		r1 = _mm256_unpacklo_epi32(temp1, temp2);
+		r2 = _mm256_unpacklo_epi32(r4, r5); 
+		r3 = _mm256_unpacklo_epi32(r6, r7); 
+
+
+
+
+	// 64-bit interleave, then lane regrouping into M[4], M[12], M[5] and M[13].
+
+		r8 = _mm256_unpacklo_epi64(r0, r1); 
+		r10 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r9 = _mm256_unpacklo_epi64(r2, r3); 
+		r11 = _mm256_unpackhi_epi64(r2, r3); 
+
+		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+		// High parts of the 32-bit interleave: these feed output rows 6, 7, 14 and 15.
+		r0 = _mm256_unpackhi_epi32(temp, temp0); 
+		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
+		r2 = _mm256_unpackhi_epi32(r4, r5); 
+		r3 = _mm256_unpackhi_epi32(r6, r7); 
+
+
+
+
+		r4 = _mm256_unpacklo_epi64(r0, r1); 
+		r6 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r5 = _mm256_unpacklo_epi64(r2, r3); 
+		r7 = _mm256_unpackhi_epi64(r2, r3); 
+
+
+
+	// Final lane regrouping for the last four output rows.
+
+	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+/*
+void transpose_unrolled(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
+
+	//for(i=0; i<8; i=i+1)
+	//{
+		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
+		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
+
+		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
+		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
+
+		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
+		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
+
+		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
+		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
+
+		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
+		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
+
+		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
+		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
+
+		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
+		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
+
+		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
+		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+	//}
+
+	//-------------------------------------------------------
+	//for(i=0; i<4; i=i+1)
+	//{
+		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
+		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
+
+		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
+		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
+
+		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
+		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
+
+		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
+		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
+
+	//}
+
+	//for(i=0; i<2; i=i+1)
+	//{
+		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
+		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
+
+		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
+		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
+
+		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	//}
+	//for(i=0; i<2; i=i+1)
+	//{
+		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
+		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
+
+		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
+		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
+
+	//}
+
+	//-------------------------------------------------------
+
+	//-------------------------------------------------------
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	//-------------------------------------------------------
+
+
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+
+
+void transpose1(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	for(i=0; i<8; i=i+1)
+	{
+		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
+		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
+	}
+
+	for(i=0; i<4; i=i+1)
+	{
+		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
+		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
+	}
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
+		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
+		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+*/
diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
new file mode 100644
index 00000000..4e4f11f8
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
@@ -0,0 +1,753 @@
+//#define SCM_SIZE 16
+
+//#pragma STDC FP_CONTRACT ON
+
+#include <immintrin.h>
+
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { // returns a*b + c in each 16-bit lane, wrapping modulo 2^16
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); // mullo keeps only the low 16 bits of each lane product
+} // 'static' is required: a plain C99 'inline' definition provides no external symbol, so a call that is not inlined would fail to link
+
+
+static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) // Accumulating schoolbook product: c_avx[k] += sum of a[i]*b[j] over i+j == k, for k = 0..30.
+									      // a and b are each read at indices 0..15; the products are added cumulatively onto the values already in c_avx.
+{
+	// Every operation is lane-wise on 16-bit elements (_mm256_mullo_epi16 / _mm256_add_epi16), so all sums wrap modulo 2^16.
+	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	register __m256i temp;
+
+	// Load the low 8 coefficients of a and b into locals; they are reused throughout the first triangle.
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// Unrolled first triangle: output coefficients c_avx[0..15]
+
+	// each coefficient is summed in temp and then added onto the caller's existing c_avx[k]
+	c_avx[0] = mul_add(a0, b0, c_avx[0]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	temp=mul_add(a1, b0, temp);
+	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+	temp = mul_add(a1, b1, temp);
+	temp=mul_add(a2, b0, temp);
+	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	temp=mul_add(a3, b0, temp);
+	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	temp=mul_add(a2, b2, temp);
+	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	temp=mul_add(a5, b0, temp);
+	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	temp=mul_add(a4, b2, temp);
+	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	temp=mul_add(a5, b2, temp);
+	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	
+		temp=mul_add(a6, b2, temp);
+		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+		temp=mul_add(a7, b2, temp);
+		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
+
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+		temp=mul_add(a[8], b2, temp);
+		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+		temp=mul_add(a[9], b2, temp);
+		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+		temp=mul_add(a[10], b2, temp);
+		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+		temp=mul_add(a[11], b2, temp);
+		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
+
+
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+		temp=mul_add(a[12], b2, temp);
+		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+		temp=mul_add(a[13], b2, temp);
+		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
+
+
+	// Unrolled second triangle (c_avx[16..30]): the locals are reloaded with the high 8 coefficients, a0=a[14], a1=a[15], a2..a7=a[13]..a[8] (b likewise)
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+		temp=mul_add(a1, b[1], temp);
+		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
+
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+		temp=mul_add(a1, b[2], temp);
+		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
+
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+		temp=mul_add(a1, b[3], temp);
+		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
+
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+		temp=mul_add(a1, b[4], temp);
+		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
+
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+		temp=mul_add(a1, b[5], temp);
+		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
+
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+		temp=mul_add(a1, b[6], temp);
+		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
+
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+		temp=mul_add(a1, b[7], temp);
+		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
+
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+		temp=mul_add(a1, b7, temp);
+		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
+
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+		temp=mul_add(a1, b6, temp);
+		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
+
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+		temp=mul_add(a1, b5, temp);
+		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
+
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+		temp=mul_add(a1, b4, temp);
+		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
+
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+		temp=mul_add(a1, b3, temp);
+		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
+
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+		temp=mul_add(a1, b2, temp);
+		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+		temp=mul_add(a1, b0, temp);
+		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
+
+
+		c_avx[30] = mul_add(a1, b1, c_avx[30]);
+
+
+	// The final slot is overwritten with zero, not accumulated. NOTE(review): SCM_SIZE is defined outside this file; if it is 16 (see the commented-out define at the top) this is c_avx[31], which receives no product term above.
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+
+}
+
+
+
+static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) // Non-accumulating schoolbook product: c_avx[k] = sum of a[i]*b[j] over i+j == k, for k = 0..30.
+									      // a and b are each read at indices 0..15; previous contents of c_avx are overwritten, not added to.
+{
+	// Every operation is lane-wise on 16-bit elements (_mm256_mullo_epi16 / _mm256_add_epi16), so all sums wrap modulo 2^16.
+	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	__m256i temp;
+
+	// Load the low 8 coefficients of a and b into locals; they are reused throughout the first triangle.
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// Unrolled first triangle: output coefficients c_avx[0..15]
+	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[1]=mul_add(a1, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+
+	temp = mul_add(a1, b1, temp);
+	c_avx[2]= mul_add(a2, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	c_avx[3]= mul_add(a3, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	c_avx[4]= mul_add(a2, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	c_avx[5] = mul_add(a5, b0, temp);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	c_avx[6] = mul_add(a4, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	c_avx[7] = mul_add (a5, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	c_avx[8] = mul_add (a6, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+	c_avx[9] = mul_add (a7, b2, temp);
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+	c_avx[10] = mul_add (a[8], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+	c_avx[11] = mul_add (a[9], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+	c_avx[12] = mul_add (a[10], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+	c_avx[13] = mul_add (a[11], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+	c_avx[14] = mul_add (a[12], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+	c_avx[15] = mul_add (a[13], b2, temp );
+
+
+	// Unrolled second triangle (c_avx[16..30]): the locals are reloaded with the high 8 coefficients, a0=a[14], a1=a[15], a2..a7=a[13]..a[8] (b likewise)
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+	
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+	c_avx[16] = mul_add (a1, b[1], temp );
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+	c_avx[17] = mul_add (a1, b[2], temp );
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+	c_avx[18] = mul_add (a1, b[3], temp );
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+	c_avx[19] = mul_add (a1, b[4], temp );
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+	c_avx[20] = mul_add (a1, b[5], temp );
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+	c_avx[21] = mul_add (a1, b[6], temp );
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+	c_avx[22] = mul_add (a1, b[7], temp );
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+	c_avx[23] = mul_add (a1, b7, temp );
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+	c_avx[24] = mul_add (a1, b6, temp );
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+	c_avx[25] = mul_add (a1, b5, temp );
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+	c_avx[26] = mul_add (a1, b4, temp );
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+	c_avx[27] = mul_add (a1, b3, temp );
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+	c_avx[28] = mul_add (a1, b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[29] = mul_add (a1, b0, temp);
+
+	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
+
+	// The final slot is set to zero. NOTE(review): SCM_SIZE is defined outside this file; if it is 16 this is c_avx[31], which receives no product term above.
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+}
diff --git a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
new file mode 100644
index 00000000..78fb86c2
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
@@ -0,0 +1,1010 @@
+/*
+Cleaned-up version; for a step-by-step walkthrough, see the _debug file.
+*/
+//#include "timing.c"
+#include "consts.h"
+#include "matrix.c"
+#include "scm_avx.c"
+
+static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)// Karatsuba-evaluates 7 polynomials taken from a, transposes them, and multiplies them with b_bucket into c_bucket; f==0 overwrites c_bucket, any other f accumulates onto it.
+{
+	__m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; holds the 7*9 = 63 Karatsuba evaluation points of a (64 slots, the last one is zero padding)
+	// a is read at a[i*small_len_avx + 0..3] for i = 0..6; b_bucket is read as 4 groups of 16 vectors; c_bucket receives 4 groups of 2*SCM_SIZE vectors.
+	//uint16_t i;
+
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+
+
+
+		//CLOCK1=cpucycles();
+		
+		//------------------AVX evaluation for 1st poly-----------------------
+		// Each 4-vector polynomial (r0..r3) expands to 9 points: r0, r1, r2, r3, r0+r1, r2+r3, r0+r2, r1+r3, r0+r1+r2+r3.
+                    r0_avx=a[0];
+                    r1_avx=a[1];
+                    r2_avx=a[2];
+                    r3_avx=a[3];
+		    a_bucket[0]=r0_avx;
+		    a_bucket[1]=r1_avx;
+		    a_bucket[2]=r2_avx;
+		    a_bucket[3]=r3_avx;
+		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
+
+
+		//------------------AVX evaluation for 1st poly ends------------------
+
+
+		//------------------AVX evaluation for 2nd poly-----------------------
+                    r0_avx=a[small_len_avx];
+                    r1_avx=a[small_len_avx+1];
+                    r2_avx=a[small_len_avx+2];
+                    r3_avx=a[small_len_avx+3];
+		    a_bucket[0+9]=r0_avx;
+		    a_bucket[1+9]=r1_avx;
+		    a_bucket[2+9]=r2_avx;
+		    a_bucket[3+9]=r3_avx;
+		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
+
+	
+		//------------------AVX evaluation for 2nd poly ends------------------
+
+
+		//------------------AVX evaluation for 3rd poly-----------------------
+                    r0_avx=a[2*small_len_avx];
+                    r1_avx=a[2*small_len_avx+1];
+                    r2_avx=a[2*small_len_avx+2];
+                    r3_avx=a[2*small_len_avx+3];
+		    a_bucket[0+18]=r0_avx;
+		    a_bucket[1+18]=r1_avx;
+		    a_bucket[2+18]=r2_avx;
+		    a_bucket[3+18]=r3_avx;
+		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
+		
+		//------------------AVX evaluation for 3rd poly ends------------------
+
+
+		//------------------AVX evaluation for 4th poly-----------------------
+
+                    r0_avx=a[3*small_len_avx];
+                    r1_avx=a[3*small_len_avx+1];
+                    r2_avx=a[3*small_len_avx+2];
+                    r3_avx=a[3*small_len_avx+3];
+		    a_bucket[0+27]=r0_avx;
+		    a_bucket[1+27]=r1_avx;
+		    a_bucket[2+27]=r2_avx;
+		    a_bucket[3+27]=r3_avx;
+		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
+		
+		//------------------AVX evaluation for 4th poly ends------------------
+
+		//------------------AVX evaluation for 5th poly-----------------------
+		
+                    r0_avx=a[4*small_len_avx+0];
+                    r1_avx=a[4*small_len_avx+1];
+                    r2_avx=a[4*small_len_avx+2];
+                    r3_avx=a[4*small_len_avx+3];
+		    a_bucket[0+36]=r0_avx;
+		    a_bucket[1+36]=r1_avx;
+		    a_bucket[2+36]=r2_avx;
+		    a_bucket[3+36]=r3_avx;
+		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
+		
+		//------------------AVX evaluation for 5th poly ends------------------
+
+
+		//------------------AVX evaluation for 6th poly-----------------------
+                    r0_avx=a[5*small_len_avx];
+                    r1_avx=a[5*small_len_avx+1];
+                    r2_avx=a[5*small_len_avx+2];
+                    r3_avx=a[5*small_len_avx+3];
+		    a_bucket[0+45]=r0_avx;
+		    a_bucket[1+45]=r1_avx;
+		    a_bucket[2+45]=r2_avx;
+		    a_bucket[3+45]=r3_avx;
+		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
+		
+		//------------------AVX evaluation for 6th poly ends------------------
+
+		//------------------AVX evaluation for 7th poly-----------------------
+
+                    r0_avx=a[6*small_len_avx];
+                    r1_avx=a[6*small_len_avx+1];
+                    r2_avx=a[6*small_len_avx+2];
+                    r3_avx=a[6*small_len_avx+3];
+		    a_bucket[0+54]=r0_avx;
+		    a_bucket[1+54]=r1_avx;
+		    a_bucket[2+54]=r2_avx;
+		    a_bucket[3+54]=r3_avx;
+		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
+
+		//------------------AVX evaluation for 7th poly ends------------------
+		// Only a_bucket[0..62] were written above, yet the last group of 16 (a_bucket+48) is read in full below: zero the unused slot so no uninitialized vector is read.
+		a_bucket[63] = _mm256_setzero_si256();
+
+		//CLOCK2=cpucycles();
+		//CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1);
+		//printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1);
+
+
+		//CLOCK1=cpucycles();
+		//-----------------Forward transposes--------------------------------------
+			transpose_n1(a_bucket);
+			transpose_n1(a_bucket+16);
+			transpose_n1(a_bucket+32);
+			transpose_n1(a_bucket+48);
+
+		//-----------------Forward transposes ends---------------------------------
+		// NOTE(review): b_bucket is presumably already in the same transposed layout as a_bucket is after transpose_n1 - confirm against the caller.
+		//----------------------all multiplications---------------------------------
+		if(f==0){
+			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		else{
+			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		/*
+		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
+		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
+		*/
+
+
+		//----------------------all multiplications ends-----------------------------
+
+
+		//-----------------Reverse transposes (disabled: c_bucket is left as the schoolbook routines wrote it)-----
+
+			/*
+			transpose(c_bucket);
+			transpose(c_bucket+16);
+
+			transpose(c_bucket+2*SCM_SIZE);
+			transpose(c_bucket+16+2*SCM_SIZE);
+
+			transpose(c_bucket+4*SCM_SIZE);
+			transpose(c_bucket+16+4*SCM_SIZE);
+
+			transpose(c_bucket+6*SCM_SIZE);
+			transpose(c_bucket+16+6*SCM_SIZE);
+			*/
+		//-----------------Reverse transposes ends---------------------------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
+
+		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
+		
+}
+
+static void KARA_eval(__m256i* b, __m256i *b_bucket){
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
+
+
+		//-------1st poly----------------------------------------------------
+                    r0_avx=b[0];
+                    r1_avx=b[1];
+                    r2_avx=b[2];
+                    r3_avx=b[3];
+		    b_bucket[0]=r0_avx;
+		    b_bucket[1]=r1_avx;
+		    b_bucket[2]=r2_avx;
+		    b_bucket[3]=r3_avx;
+		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
+		//-------2nd poly----------------------------------------------------
+
+                    r0_avx=b[small_len_avx];
+                    r1_avx=b[small_len_avx+1];
+                    r2_avx=b[small_len_avx+2];
+                    r3_avx=b[small_len_avx+3];
+		    b_bucket[0+9]=r0_avx;
+		    b_bucket[1+9]=r1_avx;
+		    b_bucket[2+9]=r2_avx;
+		    b_bucket[3+9]=r3_avx;
+		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
+
+		//-------3rd poly----------------------------------------------------
+
+                    r0_avx=b[2*small_len_avx+0];
+                    r1_avx=b[2*small_len_avx+1];
+                    r2_avx=b[2*small_len_avx+2];
+                    r3_avx=b[2*small_len_avx+3];
+		    b_bucket[0+18]=r0_avx;
+		    b_bucket[1+18]=r1_avx;
+		    b_bucket[2+18]=r2_avx;
+		    b_bucket[3+18]=r3_avx;
+		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
+
+		//-------4th poly----------------------------------------------------
+                    r0_avx=b[3*small_len_avx];
+                    r1_avx=b[3*small_len_avx+1];
+                    r2_avx=b[3*small_len_avx+2];
+                    r3_avx=b[3*small_len_avx+3];
+		    b_bucket[0+27]=r0_avx;
+		    b_bucket[1+27]=r1_avx;
+		    b_bucket[2+27]=r2_avx;
+		    b_bucket[3+27]=r3_avx;
+		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
+
+		//-------5th poly----------------------------------------------------
+
+                    r0_avx=b[4*small_len_avx];
+                    r1_avx=b[4*small_len_avx+1];
+                    r2_avx=b[4*small_len_avx+2];
+                    r3_avx=b[4*small_len_avx+3];
+		    b_bucket[0+36]=r0_avx;
+		    b_bucket[1+36]=r1_avx;
+		    b_bucket[2+36]=r2_avx;
+		    b_bucket[3+36]=r3_avx;
+		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
+
+		//-------6th poly----------------------------------------------------
+
+                    r0_avx=b[5*small_len_avx];
+                    r1_avx=b[5*small_len_avx+1];
+                    r2_avx=b[5*small_len_avx+2];
+                    r3_avx=b[5*small_len_avx+3];
+		    b_bucket[0+45]=r0_avx;
+		    b_bucket[1+45]=r1_avx;
+		    b_bucket[2+45]=r2_avx;
+		    b_bucket[3+45]=r3_avx;
+		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
+
+		//-------7th poly----------------------------------------------------
+
+                    r0_avx=b[6*small_len_avx];
+                    r1_avx=b[6*small_len_avx+1];
+                    r2_avx=b[6*small_len_avx+2];
+                    r3_avx=b[6*small_len_avx+3];
+		    b_bucket[0+54]=r0_avx;
+		    b_bucket[1+54]=r1_avx;
+		    b_bucket[2+54]=r2_avx;
+		    b_bucket[3+54]=r3_avx;
+		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
+
+		//--------------Evaluating B poly ends-------------------------------
+
+			transpose_n1(b_bucket);
+			transpose_n1(b_bucket+16);
+			transpose_n1(b_bucket+32);
+			transpose_n1(b_bucket+48);	
+}
+
+static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
+
+		// For each of seven polynomials, recombine eighteen c_bucket vectors into eight output vectors result_finalK[0..7], using only 16-bit lane adds and subtracts.
+		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // the eight output vectors being assembled for the current polynomial
+
+		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; // cross-term scratch; names follow the c_bucket indices used by the 1st poly
+
+		// NOTE(review): the add/sub pattern looks like two-level Karatsuba recombination, with entry k and entry k+16 being the two halves of one product -- confirm against the multiplier.
+
+		   //------------------------AVX interpolation for 1st poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms (sum entry minus its two parts)
+				res_avx0 = c_bucket[0];
+				res_avx2 = c_bucket[1];
+				res_avx4 = c_bucket[2];
+				res_avx6 = c_bucket[3];
+
+				c6_avx=c_bucket[6];
+				c7_avx=c_bucket[7];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[16];
+				res_avx3 = c_bucket[17];
+				res_avx5 = c_bucket[18];
+				res_avx7 = c_bucket[19];
+
+				c22_avx=c_bucket[22];
+				c23_avx=c_bucket[23];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the two outer results (res_avx0..3 and res_avx4..7) from the middle block (c6, c22, c7, c23)
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors; the middle block is added into positions 2..5
+				result_final0[0]=res_avx0;
+				result_final0[1]=res_avx1;
+
+				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final0[6]=res_avx6;
+				result_final0[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 1st poly ends--------------
+
+
+		   //------------------------AVX interpolation for 2nd poly external (same steps; trailing comments name the matching 1st-poly entry)-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[9]; //c_bucket0
+				res_avx2 = c_bucket[10]; //c_bucket1
+				res_avx4 = c_bucket[11]; //c_bucket2
+				res_avx6 = c_bucket[12]; //c_bucket3
+
+				c6_avx=c_bucket[15]; //c_bucket6
+				c7_avx=c_bucket[32]; //c_bucket7 (index jumps from 15 to 32 here)
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[25]; //c_bucket0
+				res_avx3 = c_bucket[26]; //c_bucket1
+				res_avx5 = c_bucket[27]; //c_bucket2
+				res_avx7 = c_bucket[28]; //c_bucket3
+
+				c22_avx=c_bucket[31];
+				c23_avx=c_bucket[48];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the outer results from the middle block
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors
+				result_final1[0]=res_avx0;
+				result_final1[1]=res_avx1;
+
+				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final1[6]=res_avx6;
+				result_final1[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 2nd poly ends--------------
+
+		   //------------------------AVX interpolation for 3rd poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[34]; //c_bucket0
+				res_avx2 = c_bucket[35]; //c_bucket1
+				res_avx4 = c_bucket[36];
+				res_avx6 = c_bucket[37];
+
+				c6_avx=c_bucket[40];
+				c7_avx=c_bucket[41];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[50]; //c_bucket0
+				res_avx3 = c_bucket[51]; //c_bucket1
+				res_avx5 = c_bucket[52];
+				res_avx7 = c_bucket[53];
+
+				c22_avx=c_bucket[56];
+				c23_avx=c_bucket[57];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+			//loop4: subtract the outer results from the middle block
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+			//loop5: store the eight output vectors
+				result_final2[0]=res_avx0;
+				result_final2[1]=res_avx1;
+
+				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final2[6]=res_avx6;
+				result_final2[7]=res_avx7;
+
+		   //------------------------AVX interpolation for 3rd poly ends--------------
+		
+		   //------------------------AVX interpolation for 4th poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[43];
+				res_avx2 = c_bucket[44];
+				res_avx4 = c_bucket[45];
+				res_avx6 = c_bucket[46];
+
+				c6_avx=c_bucket[65];
+				c7_avx=c_bucket[66];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[59];
+				res_avx3 = c_bucket[60];
+				res_avx5 = c_bucket[61];
+				res_avx7 = c_bucket[62];
+
+				c22_avx=c_bucket[81];
+				c23_avx=c_bucket[82];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the outer results from the middle block
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors
+				result_final3[0]=res_avx0;
+				result_final3[1]=res_avx1;
+
+				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final3[6]=res_avx6;
+				result_final3[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 4th poly ends--------------
+
+		   //------------------------AVX interpolation for 5th poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[68];
+				res_avx2 = c_bucket[69];
+				res_avx4 = c_bucket[70];
+				res_avx6 = c_bucket[71];
+
+				c6_avx=c_bucket[74];
+				c7_avx=c_bucket[75];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[84];
+				res_avx3 = c_bucket[85];
+				res_avx5 = c_bucket[86];
+				res_avx7 = c_bucket[87];
+
+				c22_avx=c_bucket[90];
+				c23_avx=c_bucket[91];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the outer results from the middle block
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors
+				result_final4[0]=res_avx0;
+				result_final4[1]=res_avx1;
+
+				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final4[6]=res_avx6;
+				result_final4[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 5th poly ends--------------
+
+		   //------------------------AVX interpolation for 6th poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[77];
+				res_avx2 = c_bucket[78];
+				res_avx4 = c_bucket[79];
+				res_avx6 = c_bucket[96];
+
+				c6_avx=c_bucket[99];
+				c7_avx=c_bucket[100];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[93];
+				res_avx3 = c_bucket[94];
+				res_avx5 = c_bucket[95];
+				res_avx7 = c_bucket[112];
+
+				c22_avx=c_bucket[115];
+				c23_avx=c_bucket[116];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the outer results from the middle block
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors
+				result_final5[0]=res_avx0;
+				result_final5[1]=res_avx1;
+
+				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final5[6]=res_avx6;
+				result_final5[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 6th poly ends--------------
+
+		   //------------------------AVX interpolation for 7th poly external-------------------		
+			
+			//loop1: load the entries and form the cross terms
+				res_avx0 = c_bucket[102];
+				res_avx2 = c_bucket[103];
+				res_avx4 = c_bucket[104];
+				res_avx6 = c_bucket[105];
+
+				c6_avx=c_bucket[108];
+				c7_avx=c_bucket[109];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[118];
+				res_avx3 = c_bucket[119];
+				res_avx5 = c_bucket[120];
+				res_avx7 = c_bucket[121];
+
+				c22_avx=c_bucket[124];
+				c23_avx=c_bucket[125];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4: subtract the outer results from the middle block
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5: store the eight output vectors
+				result_final6[0]=res_avx0;
+				result_final6[1]=res_avx1;
+
+				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final6[6]=res_avx6;
+				result_final6[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 7th poly ends--------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
+		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
+
+
+
+}
+
+static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
+
+	int i;
+
+//---------------AVX data: r0..r3 are the four slices of a_avx, r4..r6 scratch, aw_avx the seven evaluated slices-----------------------------
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+	__m256i aw_avx[7*small_len_avx];
+
+//----------------AVX data----------------------------
+
+
+// EVALUATION: read a_avx as four slices of small_len_avx vectors and store seven linear combinations of them (16-bit lanes, wrapping) in aw_avx
+
+	// The combinations equal the slice polynomial r0 + r1*y + r2*y^2 + r3*y^3 at y = inf, 2, 1, -1, 1/2, -1/2, 0 (the two halves scaled by 8), in aw_avx slot order 0..6.
+
+	for (i=0; i<small_len_avx; i++){
+		r0_avx=a_avx[i];
+		r1_avx=a_avx[i + small_len_avx];
+		r2_avx=a_avx[i + 2*small_len_avx];
+		r3_avx=a_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3
+		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1); // r4 = 8*r0 + 2*r2
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3
+		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3
+		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx); // r4 = 8*r3 + 4*r2 + 2*r1
+		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3
+		aw_avx[6*small_len_avx+i]= r0_avx; // lowest slice, copied unchanged
+		aw_avx[i]= r3_avx; // highest slice, copied unchanged
+	}
+
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
+
+	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);// hands the evaluated slices, b_bucket, c_bucket and f to the batched multiplier (defined elsewhere; meaning of f not visible here)
+
+}
+
+static void TC_eval(__m256i* b_avx, __m256i* b_bucket){ // evaluates b_avx into seven slices (bw_avx), then lets KARA_eval expand them into b_bucket
+
+	int i;
+	__m256i bw_avx[7*small_len_avx]; // seven evaluated slices of small_len_avx vectors each
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx; // r0..r3: the four input slices; r4..r6: scratch
+
+	for (i=0; i<small_len_avx; i++){
+		
+		r0_avx=b_avx[i];
+		r1_avx=b_avx[i + small_len_avx];
+		r2_avx=b_avx[i + 2*small_len_avx];
+		r3_avx=b_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3
+		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1); // r4 = 8*r0 + 2*r2
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3
+		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3
+		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx); // r4 = 8*r3 + 4*r2 + 2*r1
+		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3
+		bw_avx[6*small_len_avx+i]= r0_avx; // lowest slice, copied unchanged
+		bw_avx[i]= r3_avx; // highest slice, copied unchanged
+	}
+
+	KARA_eval(bw_avx, b_bucket); // second stage: builds the per-slice sums and transposes them into b_bucket
+
+}
+
+
+static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){ // turns the products in c_bucket into 16 result vectors in res_avx; c_bucket is transposed in place
+
+	int i;
+
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+
+	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx]; // the seven recombined products filled by KARA_interpol
+
+	__m256i res_avx_output[2*AVX_N1]; // unreduced product, twice the length of the final result
+
+	// Step 1: transpose each 16-vector block of c_bucket in place (two blocks per 2*SCM_SIZE stride, eight blocks in total).
+
+	
+	transpose_n1(c_bucket);
+	transpose_n1(c_bucket+16);
+
+	transpose_n1(c_bucket+2*SCM_SIZE);
+	transpose_n1(c_bucket+16+2*SCM_SIZE);
+
+	transpose_n1(c_bucket+4*SCM_SIZE);
+	transpose_n1(c_bucket+16+4*SCM_SIZE);
+
+	transpose_n1(c_bucket+6*SCM_SIZE);
+	transpose_n1(c_bucket+16+6*SCM_SIZE);
+	
+
+	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx); // Step 2: recombine into seven products w1..w7
+
+	for (i = 0; i < 2*small_len_avx; i++) { // Step 3: per-vector interpolation; statement order matters because r0..r6 are updated in place
+
+		r0_avx = w1_avx[i];
+		r1_avx = w2_avx[i];
+		r2_avx = w3_avx[i];
+		r3_avx = w4_avx[i];
+		r4_avx = w5_avx[i];
+		r5_avx = w6_avx[i];
+		r6_avx = w7_avx[i];
+		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
+		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
+		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
+		r3_avx = _mm256_srli_epi16(r3_avx, 1); // logical shift: divides by 2 and drops the top bit
+		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
+		temp_avx = _mm256_slli_epi16(r6_avx, 6);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_slli_epi16(r4_avx, 1);
+		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
+		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 6);
+		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
+		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
+		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 3);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
+		r4_avx = _mm256_srli_epi16(r4_avx, 3); // together with the multiply above: division by 24
+		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
+		temp_avx = _mm256_slli_epi16(r3_avx, 4);
+		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
+		r1_avx= _mm256_srli_epi16(r1_avx, 1); // together with the multiply above: division by 18
+		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); // negate: r3 = -(r1 + r3)
+		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
+		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
+		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
+		r5_avx= _mm256_srli_epi16(temp_avx, 2); // together with the multiply above: division by 60
+		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
+
+		if(i<small_len_avx){ // first half of each piece: plain store (this also initialises the slots the else-branch adds to)
+			res_avx_output[0*small_len_avx+i]=r6_avx;
+			res_avx_output[1*small_len_avx+i]=r5_avx;
+			res_avx_output[2*small_len_avx+i]=r4_avx;
+			res_avx_output[3*small_len_avx+i]=r3_avx;
+			res_avx_output[4*small_len_avx+i]=r2_avx;
+			res_avx_output[5*small_len_avx+i]=r1_avx;
+			res_avx_output[6*small_len_avx+i]=r0_avx;
+		}
+		else{ // second half: i >= small_len_avx, so each index lands in the next piece's first half and is accumulated there
+			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
+			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
+			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
+			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
+			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
+			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
+			res_avx_output[6*small_len_avx+i]=r0_avx; // topmost slots have no earlier piece to add to
+		}
+	}
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
+
+	// Reduction by X^256 + 1: X^256 = -1, so vector i+16 is subtracted from vector i (the literal 16 is presumably AVX_N1 -- TODO confirm)
+	for(i=0; i<16; i++)
+  {
+		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
+  }
+
+}
diff --git a/crypto_kem/lightsaber/avx2/verify.c b/crypto_kem/lightsaber/avx2/verify.c
new file mode 100644
index 00000000..c2e5dc72
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/verify.c
@@ -0,0 +1,35 @@
+#include "verify.h"
+
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------*/
+
+
+/* Compares len bytes of a and b, visiting every byte with no early exit: returns 0 for equal strings, 1 for non-equal strings */
+uint8_t PQCLEAN_LIGHTSABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
+    uint64_t r;
+    size_t i;
+    r = 0;
+
+    for (i = 0; i < len; i++) {
+        r |= a[i] ^ b[i]; // collect every differing bit; r stays 0 only if all bytes match
+    }
+
+    r = (~r + 1); // Two's complement negation: any nonzero r (at most 0xFF) ends up with bit 63 set
+    r >>= 63; // keep only bit 63: 1 if some byte differed, 0 otherwise
+    return (uint8_t) r;
+}
+
+/* Branch-free conditional copy of len bytes from x into r: b = 1 means mov, b = 0 means don't mov (any other b yields a partial bit mask) */
+void PQCLEAN_LIGHTSABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
+    size_t i;
+
+    b = -b; // expand the flag to a byte mask: 0 -> 0x00, 1 -> 0xFF
+    for (i = 0; i < len; i++) {
+        r[i] ^= b & (x[i] ^ r[i]); // mask 0xFF: r[i] becomes x[i]; mask 0x00: r[i] is unchanged
+    }
+}
diff --git a/crypto_kem/lightsaber/avx2/verify.h b/crypto_kem/lightsaber/avx2/verify.h
new file mode 100644
index 00000000..f57ee9bc
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/verify.h
@@ -0,0 +1,22 @@
+#ifndef VERIFY_H
+#define VERIFY_H
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* compares len bytes of a and b; returns 0 for equal strings, 1 for non-equal strings */
+uint8_t PQCLEAN_LIGHTSABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
+
+/* conditionally copies len bytes from x to r: b = 1 means mov, b = 0 means don't mov */
+void PQCLEAN_LIGHTSABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
+
+#endif
diff --git a/crypto_kem/lightsaber/clean/LICENSE b/crypto_kem/lightsaber/clean/LICENSE
index 08c799e3..d5d21fff 100644
--- a/crypto_kem/lightsaber/clean/LICENSE
+++ b/crypto_kem/lightsaber/clean/LICENSE
@@ -1,8 +1 @@
-----------------------------------------------------------------------------------------
-SABER_v1.1
-
-Public domain
-
-Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy,
-Frederik Vercauteren
-----------------------------------------------------------------------------------------
+Public Domain
diff --git a/crypto_kem/lightsaber/clean/Makefile b/crypto_kem/lightsaber/clean/Makefile
index b1b532e4..160435dc 100644
--- a/crypto_kem/lightsaber/clean/Makefile
+++ b/crypto_kem/lightsaber/clean/Makefile
@@ -1,10 +1,10 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=liblightsaber_clean.a
-HEADERS=api.h cbd.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
+CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
 all: $(LIB)
 
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c
index 20cf1de2..ccb72492 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c
@@ -3,296 +3,90 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include "randombytes.h"
 #include <stdint.h>
 #include <string.h>
 
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N] = {0};
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    uint8_t seed_A[SABER_SEEDBYTES];
+    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
+    int i, j;
 
-#define h1 4 //2^(EQ-EP-1)
+    randombytes(seed_A, SABER_SEEDBYTES);
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(seed_s, SABER_NOISE_SEEDBYTES);
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+    PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A);
+    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, seed_s);
+    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1);
 
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]);
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose);
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec);
-
-static void GenMatrix(polyvec *a, const unsigned char *seed) {
-    unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_LIGHTSABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP);
         }
     }
+
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s);
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b);
+    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A));
 }
 
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t sp[SABER_L][SABER_N];
+    uint16_t bp[SABER_L][SABER_N] = {0};
+    uint16_t vp[SABER_N] = {0};
+    uint16_t mp[SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    int i, j;
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk) {
-    polyvec a[SABER_K];
+    PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A);
+    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(sp, seed_sp);
+    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0);
 
-    uint16_t skpv[SABER_K][SABER_N];
-
-    unsigned char seed[SABER_SEEDBYTES];
-    unsigned char noiseseed[SABER_COINBYTES];
-    int32_t i, j;
-    uint16_t mod_q = SABER_Q - 1;
-
-
-    uint16_t res[SABER_K][SABER_N];
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    // for not revealing system RNG state
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES);
-    randombytes(noiseseed, SABER_COINBYTES);
-
-    GenMatrix(a, seed);   //sample matrix A
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(skpv, noiseseed);
-
-    // do the matrix vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
+    for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1);
-
-    // now rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            // shift right 3 bits
-            res[i][j] = (res[i][j] + h1) & (mod_q);
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP));
+            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP);
         }
     }
 
-    // unload and pack sk=3 x (256 coefficients of 14 bits)
-    PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q);
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp);
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, pk);
+    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp);
 
-    // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits)
-    // load the public-key coefficients
-    PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(pk, res, SABER_P);
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(mp, m);
 
-
-    // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    for (j = 0; j < SABER_N; j++) {
+        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET);
     }
 
+    PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
 }
 
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
 
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) {
-    uint32_t i, j, k;
-    polyvec a[SABER_K];
-    unsigned char seed[SABER_SEEDBYTES];
-    // public key of received by the client
-    uint16_t pkcl[SABER_K][SABER_N];
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-    uint16_t res[SABER_K][SABER_N];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t mod_q = SABER_Q - 1;
-    uint16_t vprime[SABER_N];
-    unsigned char msk_c[SABER_SCALEBYTES_KEM];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    uint16_t v[SABER_N] = {0};
+    uint16_t cm[SABER_N];
+    int i;
 
-    // extract the seedbytes from Public Key.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk);
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext);
+    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s);
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES);
 
-    GenMatrix(a, seed);
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(skpv1, noiseseed);
-
-    // matrix-vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0);
-
-    // now rounding
-    //shift right 3 bits
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = ( res[i][j] + h1 ) & mod_q;
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) );
-        }
-    }
-
-    PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P);
-
-    // ************client matrix-vector multiplication ends************
-
-    // now calculate the v'
-    // unpack the public_key
-    // pkcl is the b in the protocol
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P);
     for (i = 0; i < SABER_N; i++) {
-        vprime[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            skpv1[i][j] = skpv1[i][j] & (mod_p);
-        }
+        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
     }
 
-    // vector-vector scalar multiplication with mod p
-    InnerProd(pkcl, skpv1, mod_p, vprime);
-
-    // addition of h1 to vprime
-    for (i = 0; i < SABER_N; i++) {
-        vprime[i] = vprime[i] + h1;
-    }
-
-    // unpack message_received;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((message_received[j] >> i) & 0x01);
-        }
-    }
-
-    // message encoding
-    for (i = 0; i < SABER_N; i++) {
-        message[i] = (message[i] << (SABER_EP - 1));
-    }
-
-    for (k = 0; k < SABER_N; k++) {
-        vprime[k] = ( (vprime[k] - message[k]) & (mod_p) ) >> (SABER_EP - SABER_ET);
-    }
-
-
-    PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(msk_c, vprime);
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j];
-    }
-}
-
-
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) {
-    uint32_t i, j;
-    // secret key of the server
-    uint16_t sksv[SABER_K][SABER_N];
-    uint16_t pksv[SABER_K][SABER_N];
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t v[SABER_N];
-    uint16_t op[SABER_N];
-
-    // sksv is the secret-key
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q);
-    // pksv is the ciphertext
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P);
-
-    // vector-vector scalar multiplication with mod p
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            sksv[i][j] = sksv[i][j] & (mod_p);
-        }
-    }
-    InnerProd(pksv, sksv, mod_p, v);
-
-    //Extraction
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
-
-    PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(scale_ar, op);
-
-    //addition of h1
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1);
-    }
-
-    // pack decrypted message
-    POL2MSG(v, message_dec);
-}
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) {
-    uint16_t acc[SABER_N];
-    int32_t i, j, k;
-
-    if (transpose == 1) {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_LIGHTSABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N);
-
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    //reduction mod p
-                    res[i][k] = (res[i][k] & mod);
-                    //clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    } else {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_LIGHTSABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N);
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    // reduction
-                    res[i][k] = res[i][k] & mod;
-                    // clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) {
-    int32_t i, j;
-
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
-
-
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) {
-    uint32_t j, k;
-    uint16_t acc[SABER_N];
-
-    // vector-vector scalar multiplication with mod p
-    for (j = 0; j < SABER_K; j++) {
-        PQCLEAN_LIGHTSABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N);
-
-        for (k = 0; k < SABER_N; k++) {
-            res[k] = res[k] + acc[k];
-            // reduction
-            res[k] = res[k] & mod;
-            // clear the accumulator
-            acc[k] = 0;
-        }
-    }
+    PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(m, v);
 }
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.h b/crypto_kem/lightsaber/clean/SABER_indcpa.h
index 4f806c55..efccbf5e 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.h
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.h
@@ -1,9 +1,13 @@
 #ifndef INDCPA_H
 #define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk);
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext);
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec);
 
 #endif
-
diff --git a/crypto_kem/lightsaber/clean/SABER_params.h b/crypto_kem/lightsaber/clean/SABER_params.h
index eb3825f2..a6a9fc55 100644
--- a/crypto_kem/lightsaber/clean/SABER_params.h
+++ b/crypto_kem/lightsaber/clean/SABER_params.h
@@ -1,50 +1,39 @@
 #ifndef PARAMS_H
 #define PARAMS_H
 
-#include "api.h"
 
-#define SABER_K 2
+/* Change this for different security strengths */
+
+/* Don't change anything below this line */
+#define SABER_L 2
 #define SABER_MU 10
 #define SABER_ET 3
 
-
 #define SABER_EQ 13
 #define SABER_EP 10
-
 #define SABER_N 256
-#define SABER_Q 8192
-#define SABER_P 1024
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_SEEDBYTES 32
+#define SABER_NOISE_SEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_HASHBYTES       32
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8)
 
-#define SABER_POLYBYTES       416 //13*256/8 
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8)
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)
 
-#define SABER_POLYVECBYTES    (SABER_K * SABER_POLYBYTES)
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8)
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)
 
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
-
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
-
-#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8)
-
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
-
-
-
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM)
 
 #endif
-
diff --git a/crypto_kem/lightsaber/clean/api.h b/crypto_kem/lightsaber/clean/api.h
index 4f73c035..f0fe63f1 100644
--- a/crypto_kem/lightsaber/clean/api.h
+++ b/crypto_kem/lightsaber/clean/api.h
@@ -1,14 +1,18 @@
 #ifndef PQCLEAN_LIGHTSABER_CLEAN_API_H
 #define PQCLEAN_LIGHTSABER_CLEAN_API_H
 
+
 #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_ALGNAME "LightSaber"
-#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_SECRETKEYBYTES 1568
-#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_PUBLICKEYBYTES (2*320+32)
 #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_BYTES 32
 #define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 736
+#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_PUBLICKEYBYTES 672
+#define PQCLEAN_LIGHTSABER_CLEAN_CRYPTO_SECRETKEYBYTES 1568
 
 int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
-int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+
+int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
 
 #endif /* api_h */
diff --git a/crypto_kem/lightsaber/clean/cbd.c b/crypto_kem/lightsaber/clean/cbd.c
index f6ebe4d7..7e3f2be2 100644
--- a/crypto_kem/lightsaber/clean/cbd.c
+++ b/crypto_kem/lightsaber/clean/cbd.c
@@ -1,3 +1,7 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
-#include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
-#include <stdint.h>
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,10 +20,7 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
     uint64_t t, d, a[4], b[4];
     int i, j;
 
@@ -34,8 +31,8 @@ void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) {
             d += (t >> j) & 0x0842108421UL;
         }
 
-        a[0] =  d & 0x1f;
-        b[0] = (d >>  5) & 0x1f;
+        a[0] = d & 0x1f;
+        b[0] = (d >> 5) & 0x1f;
         a[1] = (d >> 10) & 0x1f;
         b[1] = (d >> 15) & 0x1f;
         a[2] = (d >> 20) & 0x1f;
@@ -43,9 +40,9 @@ void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) {
         a[3] = (d >> 30) & 0x1f;
         b[3] = (d >> 35);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]);
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/lightsaber/clean/cbd.h b/crypto_kem/lightsaber/clean/cbd.h
index 37553425..dffd4dc5 100644
--- a/crypto_kem/lightsaber/clean/cbd.h
+++ b/crypto_kem/lightsaber/clean/cbd.h
@@ -1,6 +1,5 @@
 #ifndef CBD_H
 #define CBD_H
-
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_LIGHTSABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
+
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/kem.c b/crypto_kem/lightsaber/clean/kem.c
index 8aad4302..eb9353b1 100644
--- a/crypto_kem/lightsaber/clean/kem.c
+++ b/crypto_kem/lightsaber/clean/kem.c
@@ -1,5 +1,6 @@
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
+#include "api.h"
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
@@ -7,90 +8,71 @@
 #include <stdio.h>
 #include <string.h>
 
-int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
+
+int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
     int i;
 
-    // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
-    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk);
-
-    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk
+    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
-        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];
+        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
     }
 
-    // Then hash(pk) is appended.
-    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES);
+    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
 
-    // Remaining part of sk contains a pseudo-random number.
-    // This is output when check in crypto_kem_dec() fails.
-    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES );
+    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number.
+    // This is output when check in PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec() fails.
     return (0);
 }
 
-int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) {
-    // Will contain key, coins
-    unsigned char kr[64];
-    unsigned char buf[64];
+int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
+
+    uint8_t kr[64]; // Will contain key, coins
+    uint8_t buf[64];
 
     randombytes(buf, 32);
 
-    // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output
-    sha3_256(buf, buf, 32);
+    sha3_256(buf, buf, 32); // BUF[0:31] <-- random message (will be used as the key for client) Note: hash does not release system RNG output
 
-    // BUF[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
-    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES);
-
-    // kr[0:63] <-- Hash(buf[0:63]);
-    sha3_512(kr, buf, 64);
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // BUF[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
 
+    sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    // buf[0:31] contains message; kr[32:63] contains randomness r;
-    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk,  ct);
+    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
 
-
-int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
+int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
     int i;
-    unsigned char fail;
-    unsigned char cmp[SABER_BYTES_CCA_DEC];
-    unsigned char buf[64];
-
-    // Will contain key, coins
-    unsigned char kr[64];
-    const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
-
-    // buf[0:31] <-- message
-    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(sk, ct, buf);
+    uint8_t fail;
+    uint8_t cmp[SABER_BYTES_CCA_DEC];
+    uint8_t buf[64];
+    uint8_t kr[64]; // Will contain key, coins
+    const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
 
+    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message
 
     // Multitarget countermeasure for coins + contributory KEM
-    // Save hash by storing h(pk) in sk
-    for (i = 0; i < 32; i++) {
+    for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk
         buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
     }
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp);
+    PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk);
 
+    fail = PQCLEAN_LIGHTSABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC);
 
-    fail = PQCLEAN_LIGHTSABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC);
-
-    // overwrite coins in kr with h(c)
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
 
     PQCLEAN_LIGHTSABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c
index 4b1c409f..2a39a1d7 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.c
+++ b/crypto_kem/lightsaber/clean/pack_unpack.c
@@ -1,254 +1,140 @@
+#include "api.h"
 #include "pack_unpack.h"
+#include <string.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) { // packs the low 3 bits of every coefficient, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 8 coefficients into 3 bytes
     for (j = 0; j < SABER_N / 8; j++) {
         offset_byte = 3 * j;
         offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) |
-                                 ((data[offset_data + 1] & 0x7) << 3) |
-                                 ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  |
-                                 ((data[offset_data + 3] & 0x7) << 1) |
-                                 ((data[offset_data + 4] & 0x7) << 4) |
-                                 (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) |
-                                 ((data[offset_data + 6] & 0x7) << 2) |
-                                 ((data[offset_data + 7] & 0x7) << 5);
+        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ((data[offset_data + 1] & 0x7) << 3) | ((data[offset_data + 2] & 0x3) << 6);
+        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2) & 0x01) | ((data[offset_data + 3] & 0x7) << 1) | ((data[offset_data + 4] & 0x7) << 4) | (((data[offset_data + 5]) & 0x01) << 7);
+        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1) & 0x03) | ((data[offset_data + 6] & 0x7) << 2) | ((data[offset_data + 7] & 0x7) << 5);
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) { // unpacks 3-bit coefficients, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 3 bytes into 8 coefficients
     for (j = 0; j < SABER_N / 8; j++) {
         offset_byte = 3 * j;
         offset_data = 8 * j;
         data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) |
-                                (((bytes[offset_byte + 1]) & 0x01) << 2);
-        data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) |
-                                (((bytes[offset_byte + 2]) & 0x03) << 1);
+        data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3) & 0x07;
+        data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6) & 0x03) | (((bytes[offset_byte + 1]) & 0x01) << 2);
+        data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1) & 0x07;
+        data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4) & 0x07;
+        data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7) & 0x01) | (((bytes[offset_byte + 2]) & 0x03) << 1);
         data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07);
         data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07);
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) |
-                   ((data[offset_data + 1] & 0x0f) << 4);
-    }
-}
-
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) {
-    uint32_t j;
-    uint32_t offset_data;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        ar[offset_data] = bytes[j] & 0x0f;
-        ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
-    }
-}
-
-void PQCLEAN_LIGHTSABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) |
-                                 ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) |
-                                 ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) |
-                                 ((data[offset_data + 3] & 0x3f) << 2);
-    }
-}
-
-
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |
-                                ((bytes[offset_byte + 1] & 0x0f) << 2);
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) |
-                                ((bytes[offset_byte + 2] & 0x03) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
-    }
-}
-
-
-static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) |
-                                     ((data[i][offset_data + 1] & 0x3f) << 2);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) |
-                                     ((data[i][offset_data + 2] & 0x0f) << 4);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) |
-                                     ((data[i][offset_data + 3] & 0x03) << 6);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x03) << 8);
-            data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) |
-                                       ((bytes[offset_byte + 2] & 0x0f) << 6);
-            data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) |
-                                       ((bytes[offset_byte + 3] & 0x3f) << 4);
-            data[i][offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) |
-                                       ((bytes[offset_byte + 4] & 0xff) << 2);
-        }
-    }
-}
-
-
-
-static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) |
-                                     ((data[i][offset_data + 1] & 0x07) << 5);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) |
-                                     ((data[i][offset_data + 2] & 0x3f) << 2);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) |
-                                     ((data[i][offset_data + 3] & 0x01) << 7);
-            bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff);
-            bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) |
-                                     ((data[i][offset_data + 4] & 0x0f) << 4);
-            bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff);
-            bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) |
-                                     ((data[i][offset_data + 5] & 0x7f) << 1);
-            bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) |
-                                     ((data[i][offset_data + 6] & 0x03) << 6);
-            bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff);
-            bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) |
-                                      ((data[i][offset_data + 7] & 0x1f) << 3);
-            bytes[offset_byte + 12] = ((data[i][offset_data + 7] >> 5) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                       ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                       ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                       ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                       ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                       ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                       ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                       ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                       ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                       ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                       ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                       ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-}
-
-//only BS2POLq no BS2POLp
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) { // packs the low 13 bits of every coefficient, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 8 coefficients into 13 bytes
     for (j = 0; j < SABER_N / 8; j++) {
         offset_byte = 13 * j;
         offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                ((bytes[offset_byte + 12] & 0xff) << 5);
+        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
+        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5);
+        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
+        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
+        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
+        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
+        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
+        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
+        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
+        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
+        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
+        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
+        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff);
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        POLVECq2BS(bytes, data);
+static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) { // unpacks 13-bit coefficients, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 13 bytes into 8 coefficients
+    for (j = 0; j < SABER_N / 8; j++) {
+        offset_byte = 13 * j;
+        offset_data = 8 * j;
+        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
+        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
+        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
+        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
+        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
+        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
+        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
+        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        BS2POLVECp(bytes, data);
-    } else if (modulus == 8192) {
-        BS2POLVECq(bytes, data);
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) { // packs the low 10 bits of every coefficient, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 4 coefficients into 5 bytes
+    for (j = 0; j < SABER_N / 4; j++) {
+        offset_byte = 5 * j;
+        offset_data = 4 * j;
+        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
+        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2);
+        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
+        bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6);
+        bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff);
+    }
+}
+
+static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { // unpacks 10-bit coefficients, least-significant bits first
+    size_t j, offset_byte, offset_data; // each iteration turns 5 bytes into 4 coefficients
+    for (j = 0; j < SABER_N / 4; j++) {
+        offset_byte = 5 * j;
+        offset_data = 4 * j;
+        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8);
+        data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6);
+        data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4);
+        data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2);
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) { // packs SABER_L polynomials back to back with POLq2BS
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLq2BS(bytes + i * SABER_POLYBYTES, data[i]); // polynomial i occupies its own SABER_POLYBYTES-byte slot
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) { // unpacks SABER_L polynomials with BS2POLq
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLq(data[i], bytes + i * SABER_POLYBYTES); // polynomial i is read from its own SABER_POLYBYTES-byte slot
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) { // packs SABER_L polynomials back to back with POLp2BS
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]); // stride is SABER_EP * SABER_N / 8 bytes per packed polynomial
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { // unpacks SABER_L polynomials with BS2POLp
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8)); // stride is SABER_EP * SABER_N / 8 bytes per packed polynomial
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) { // expands each message bit into one 0/1 coefficient
+    size_t i, j;
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            data[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j becomes coefficient 8 * j + i
+        }
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) { // packs bit 0 of each coefficient into the message bytes
+    size_t i, j;
+    memset(bytes, 0, SABER_KEYBYTES);
+    // bytes start at zero so the bits can be OR-ed in one at a time
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i); // coefficient 8 * j + i supplies bit i of byte j
+        }
     }
 }
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.h b/crypto_kem/lightsaber/clean/pack_unpack.h
index 86fd2fad..44ccf31a 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.h
+++ b/crypto_kem/lightsaber/clean/pack_unpack.h
@@ -1,28 +1,27 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
-
 #include "SABER_params.h"
 #include <stdint.h>
 #include <stdio.h>
 
+void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]); /* packs 3 bits per coefficient */
 
-void PQCLEAN_LIGHTSABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar);
-
-void PQCLEAN_LIGHTSABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data);
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]); /* unpacks 3 bits per coefficient */
 
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]); /* packs 13 bits per coefficient */
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]); /* packs 10 bits per coefficient */
+
+/* Byte string -> polynomial vector; the destination is the first argument. */
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+
+/* SABER_KEYBYTES-byte message <-> polynomial carrying one message bit per coefficient. */
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c
index fc86ab3c..1c1e22cc 100644
--- a/crypto_kem/lightsaber/clean/poly.c
+++ b/crypto_kem/lightsaber/clean/poly.c
@@ -1,21 +1,49 @@
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-#include "SABER_params.h"
+#include "api.h"
 #include "cbd.h"
 #include "fips202.h"
+#include "pack_unpack.h"
 #include "poly.h"
+#include "poly_mul.h"
+#include <stdio.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) {
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (size_t i = 0; i < SABER_K; i++) {
-        PQCLEAN_LIGHTSABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
+void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) { // res += A * s, or res += A^T * s when transpose == 1
+    int i, j; // res is only accumulated into here, never cleared
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_L; j++) {
+            if (transpose == 1) {
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]); // walks column i of A, i.e. row i of the transpose
+            } else {
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]); // walks row i of A
+            }
+        }
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) { // res += sum over j of b[j] * s[j]
+    int j; // res is only accumulated into here, never cleared
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(b[j], s[j], res);
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) { // fills A from a SHAKE-128 expansion of seed
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES]; // one packed polynomial vector per row of A
+    int i;
+
+    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); // row i is unpacked from chunk i of the SHAKE output
+    }
+}
+
+void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) { // fills s from a SHAKE-128 expansion of seed
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // SABER_POLYCOINBYTES bytes of sampler input per polynomial
+    size_t i;
+
+    shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES);
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES); // polynomial i is sampled from chunk i of the SHAKE output
     }
 }
diff --git a/crypto_kem/lightsaber/clean/poly.h b/crypto_kem/lightsaber/clean/poly.h
index 47ceeebb..1f50c48e 100644
--- a/crypto_kem/lightsaber/clean/poly.h
+++ b/crypto_kem/lightsaber/clean/poly.h
@@ -1,26 +1,15 @@
 #ifndef POLY_H
 #define POLY_H
-
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-
-
 #include "SABER_params.h"
 #include <stdint.h>
 
-typedef struct {
-    uint16_t coeffs[SABER_N];
-} poly;
+void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose); /* res += a * s (a transposed when transpose == 1) */
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]); /* res += sum of b[j] * s[j] */
+/* Expands seed with SHAKE-128 into the SABER_L x SABER_L matrix a. */
+void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+/* Expands seed with SHAKE-128 and samples the SABER_L polynomials of s from the output. */
+void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed);
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c
index 926910b5..5e37a024 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.c
+++ b/crypto_kem/lightsaber/clean/poly_mul.c
@@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) {
-    uint32_t i;
-    // normal multiplication
-    uint16_t c[512];
-
-    for (i = 0; i < 512; i++) {
-        c[i] = 0;
-    }
+/* res += a*b, folded to SABER_N coefficients; sums wrap modulo 2^16 because nothing is masked here */
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+    uint16_t c[2 * SABER_N] = {0}; // double-length scratch buffer filled by toom_cook_4way
+    int i;
 
     toom_cook_4way(a, b, c);
 
-    // reduction
-    for (i = n; i < 2 * n; i++) {
-        res[i - n] = (c[i - n] - c[i]) & (p - 1);
+    /* reduction: the upper half of c is subtracted from the lower half (x^SABER_N == -1) */
+    for (i = SABER_N; i < 2 * SABER_N; i++) {
+        res[i - SABER_N] += (c[i - SABER_N] - c[i]); // accumulate instead of overwrite
     }
 }
diff --git a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h
index 8d634584..0d5cf6ed 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.h
+++ b/crypto_kem/lightsaber/clean/poly_mul.h
@@ -1,9 +1,9 @@
-#ifndef POLYMUL_H
-#define POLYMUL_H
-
+#ifndef POLY_MUL_H
+#define POLY_MUL_H
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n);
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]); /* res += a*b; res is accumulated into, not overwritten */
+
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/verify.c b/crypto_kem/lightsaber/clean/verify.c
index 52c6969b..05e564da 100644
--- a/crypto_kem/lightsaber/clean/verify.c
+++ b/crypto_kem/lightsaber/clean/verify.c
@@ -1,3 +1,5 @@
+#include "verify.h"
+
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -5,26 +7,25 @@ This file has been adapted from the implementation
  by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------*/
-#include "verify.h"
-#include <stdint.h>
+
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_LIGHTSABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) {
+uint8_t PQCLEAN_LIGHTSABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) {
     uint64_t r;
     size_t i;
-
     r = 0;
+    /* OR together every byte difference; the loop has no early exit and no branch on the data */
     for (i = 0; i < len; i++) {
         r |= a[i] ^ b[i];
     }
 
     r = (~r + 1); // Two's complement
     r >>= 63;
-    return (unsigned char)r;
+    return (uint8_t) r; // top bit of the negated accumulator: 1 iff some byte differed
 }
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_LIGHTSABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
+void PQCLEAN_LIGHTSABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
     size_t i;
 
     b = -b;
diff --git a/crypto_kem/lightsaber/clean/verify.h b/crypto_kem/lightsaber/clean/verify.h
index 32c2adb5..4f538e6f 100644
--- a/crypto_kem/lightsaber/clean/verify.h
+++ b/crypto_kem/lightsaber/clean/verify.h
@@ -1,6 +1,5 @@
 #ifndef VERIFY_H
 #define VERIFY_H
-
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 #include <stdint.h>
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_LIGHTSABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len);
+uint8_t PQCLEAN_LIGHTSABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_LIGHTSABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);
+void PQCLEAN_LIGHTSABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
 
 #endif
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 4de4f1c8..50250180 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,4 +14,13 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/14ede83f1ff3bcc41f0464543542366c68b55871
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+    - name: avx2
+      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      supported_platforms:
+          - architecture: x86_64
+            operating_systems:
+                - Linux
+                - Darwin
+            required_flags:
+                - avx2
diff --git a/crypto_kem/saber/avx2/LICENSE b/crypto_kem/saber/avx2/LICENSE
new file mode 100644
index 00000000..d5d21fff
--- /dev/null
+++ b/crypto_kem/saber/avx2/LICENSE
@@ -0,0 +1 @@
+Public Domain
diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile
new file mode 100644
index 00000000..65cc21ef
--- /dev/null
+++ b/crypto_kem/saber/avx2/Makefile
@@ -0,0 +1,22 @@
+# This Makefile can be used with GNU Make or BSD Make
+# HEADERS is the prerequisite list of every object; it also names polymul/toom-cook_4way.c because SABER_indcpa.c #includes that file. NOTE(review): add any further files it includes.
+LIB=libsaber_avx2.a
+HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h polymul/toom-cook_4way.c
+OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+
+CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
+
+all: $(LIB)
+
+%.o: %.s $(HEADERS)
+	$(AS) -o $@ $<
+
+%.o: %.c $(HEADERS)
+	$(CC) $(CFLAGS) -c -o $@ $<
+
+$(LIB): $(OBJECTS)
+	$(AR) -r $@ $(OBJECTS)
+
+clean:
+	$(RM) $(OBJECTS)
+	$(RM) $(LIB)
diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c
new file mode 100644
index 00000000..d16a7a06
--- /dev/null
+++ b/crypto_kem/saber/avx2/SABER_indcpa.c
@@ -0,0 +1,416 @@
+#include "./polymul/toom-cook_4way.c"
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "randombytes.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+//#include "randombytes.h"
+//#include "./polymul/toom_cook_4/toom-cook_4way.c"
+
+#define h1 4 //2^(EQ-EP-1)
+
+#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+
+
+static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
+    // Folds each run of eight coefficients into one byte: coefficient 8*b+n is shifted left by n and OR-ed in.
+    for (int32_t byte_idx = 0; byte_idx < SABER_KEYBYTES; byte_idx++) {
+        uint8_t packed = 0;
+        for (int32_t bit = 0; bit < 8; bit++) {
+            packed = packed | (message_dec_unpacked[byte_idx * 8 + bit] << bit);
+        }
+        message_dec[byte_idx] = packed;
+    }
+}
+
+/*-----------------------------------------------------------------------------------
+    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
+-------------------------------------------------------------------------------------*/
+
+static void GenMatrix(polyvec *a, const uint8_t *seed) {
+    // Expand the seed with SHAKE-128 into SABER_K * SABER_K polynomials of
+    // SABER_N coefficients, 13 packed bits per coefficient, stored row-major.
+    uint8_t expanded[SABER_K * SABER_K * 13 * SABER_N / 8];
+    const uint8_t *packed = expanded;
+    const uint16_t q_mask = (SABER_Q - 1);
+    uint16_t unpacked[SABER_N];
+    int row, col, idx;
+
+    shake128(expanded, sizeof(expanded), seed, SABER_SEEDBYTES);
+    for (row = 0; row < SABER_K; row++) {
+        for (col = 0; col < SABER_K; col++, packed += 13 * SABER_N / 8) {
+            PQCLEAN_SABER_AVX2_BS2POLq(unpacked, packed);
+            for (idx = 0; idx < SABER_N; idx++) {
+                a[row].vec[col].coeffs[idx] = unpacked[idx] & q_mask;
+            }
+        }
+    }
+}
+
+static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
+    // Stretch the noise seed with SHAKE-128, then turn each consecutive
+    // SABER_MU * SABER_N / 8 byte slice into one polynomial of r via cbd.
+    uint8_t noise[SABER_MU * SABER_N * SABER_K / 8];
+    const uint8_t *slice = noise;
+    uint32_t poly;
+
+    shake128(noise, sizeof(noise), seed, SABER_NOISESEEDBYTES);
+    for (poly = 0; poly < SABER_K; poly++, slice += SABER_MU * SABER_N / 8) {
+        PQCLEAN_SABER_AVX2_cbd(r[poly], slice);
+    }
+}
+
+//********************************matrix-vector mul routines*****************************************************
+static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+    // For every output row: feed NUM_POLY (matrix entry, b_bucket entry) pairs
+    // to toom_cook_4way_avx_n1 through one shared c_bucket, then let
+    // TC_interpol turn that bucket into res_avx[row].
+    // isTranspose == 0 reads entry [row][col]; any other value reads [col][row].
+    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
+    int64_t row, col;
+
+    for (row = 0; row < NUM_POLY; row++) {
+        for (col = 0; col < NUM_POLY; col++) {
+            // Pick the matrix entry first so a single call site serves both layouts.
+            __m256i *entry = (isTranspose == 0) ? a1_avx_combined[row][col] : a1_avx_combined[col][row];
+
+            toom_cook_4way_avx_n1(entry, b_bucket[col], c_bucket, col);
+        }
+
+        TC_interpol(c_bucket, res_avx[row]);
+    }
+}
+
+static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+    // Runs toom_cook_4way_avx_n1 on each (a_avx[k], b_bucket[k]) pair against
+    // one shared c_bucket, then has TC_interpol produce res_avx from it.
+    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
+    int64_t k;
+
+    for (k = 0; k < NUM_POLY; k++) {
+        toom_cook_4way_avx_n1(a_avx[k], b_bucket[k], c_bucket, k);
+    }
+    TC_interpol(c_bucket, res_avx);
+}
+
+//********************************matrix-vector mul routines*****************************************************
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // IND-CPA key generation: sk <- packed secret vector, pk <- packed rounded product followed by the seed.
+    polyvec a[SABER_K];
+
+    uint16_t skpv1[SABER_K][SABER_N]; // secret vector; reused further down as scratch for the public key
+
+
+
+    uint8_t seed[SABER_SEEDBYTES];
+    uint8_t noiseseed[SABER_COINBYTES];
+    int32_t i, j, k;
+
+
+//--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+    //__m256i acc[2*SABER_N/16];
+
+    mod = _mm256_set1_epi16(SABER_Q - 1);
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+
+//--------------AVX declaration ends------------------
+
+    randombytes(seed, SABER_SEEDBYTES);
+
+    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(noiseseed, SABER_COINBYTES);
+
+
+    GenMatrix(a, seed); //sample matrix A
+
+    GenSecret(skpv1, noiseseed);
+
+
+// Load sk into avx vectors
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+
+    }
+
+    // Load a into avx vectors
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+
+
+
+    //------------------------do the matrix vector multiplication and rounding------------
+    // TC_eval runs once per secret polynomial; matrix_vector_mul then uses b_bucket[j] for every row.
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+
+    // Now truncation
+    // Per coefficient: add h1, shift right by SABER_EQ - SABER_EP, then AND with mod (SABER_Q - 1).
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+        }
+    }
+
+    //------------------Pack sk into byte string-------
+
+    PQCLEAN_SABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
+
+    //------------------Pack pk into byte string-------
+
+    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
+        }
+    }
+    PQCLEAN_SABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
+
+
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
+        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    }
+
+}
+
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    // IND-CPA encryption of m under pk with randomness derived from noiseseed.
+    // ciphertext = packed rounded matrix-vector product, followed by the 4-bit-packed v' term.
+    uint32_t i, j, k;
+    polyvec a[SABER_K];     // skpv;
+    uint8_t seed[SABER_SEEDBYTES];
+    uint16_t pkcl[SABER_K][SABER_N];    // public key received by the client, unpacked
+
+
+    uint16_t skpv1[SABER_K][SABER_N];
+    uint16_t temp[SABER_K][SABER_N];
+    uint16_t message[SABER_KEYBYTES * 8];
+
+    uint8_t msk_c[SABER_SCALEBYTES_KEM];
+
+    //--------------AVX declaration------------------
+
+    __m256i sk_avx[SABER_K][SABER_N / 16];
+    __m256i mod, mod_p;
+    __m256i res_avx[SABER_K][SABER_N / 16];
+    __m256i vprime_avx[SABER_N / 16];
+    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
+    //__m256i acc[2*SABER_N/16];
+
+    __m256i pkcl_avx[SABER_K][SABER_N / 16];
+
+    __m256i message_avx[SABER_N / 16];
+
+    mod = _mm256_set1_epi16(SABER_Q - 1);
+    mod_p = _mm256_set1_epi16(SABER_P - 1);
+
+
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+
+    //--------------AVX declaration ends------------------
+    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
+        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    }
+
+    GenMatrix(a, seed);
+    GenSecret(skpv1, noiseseed);
+
+    // ----------- Load skpv1 into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+        }
+    }
+
+    // ----------- Load matrix a into avx vectors ----------
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_K; j++) {
+            for (k = 0; k < SABER_N / 16; k++) {
+                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
+            }
+        }
+    }
+    //-----------------matrix-vector multiplication and rounding
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sk_avx[j], b_bucket[j]);
+    }
+    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+
+    // Now truncation
+
+    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N / 16; j++) {
+            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
+            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
+            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
+
+        }
+    }
+
+
+    //-----this result should be put in b_prime for later use in server.
+    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
+        for (j = 0; j < SABER_N / 16; j++) {
+            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
+        }
+    }
+
+    PQCLEAN_SABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
+
+//**************client matrix-vector multiplication ends******************//
+
+    //------now calculate the v'
+
+    //-------unpack the public_key
+    PQCLEAN_SABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
+        }
+    }
+
+    // InnerProduct
+    //for(k=0;k<SABER_N/16;k++){
+    //  vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]);
+    //}
+
+    // vector-vector scalar multiplication with mod p
+    // b_bucket is passed again here; it was filled by TC_eval from sk_avx above.
+    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+
+    // Computation of v'+h1
+    for (i = 0; i < SABER_N / 16; i++) { //adding h1
+        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
+    }
+
+    // unpack m;
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            message[8 * j + i] = ((m[j] >> i) & 0x01);
+        }
+    }
+    // message encoding
+    for (i = 0; i < SABER_N / 16; i++) {
+        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
+        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
+    }
+
+    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
+    for (k = 0; k < SABER_N / 16; k++) {
+        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
+        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
+        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
+    }
+
+    // Unpack avx
+    for (j = 0; j < SABER_N / 16; j++) {
+        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
+    }
+
+    PQCLEAN_SABER_AVX2_SABER_pack_4bit(msk_c, temp[0]);
+
+
+    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
+        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    }
+
+}
+
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    // IND-CPA decryption: recovers the SABER_KEYBYTES-byte message m from ciphertext using sk.
+    uint32_t i, j;
+    uint16_t sksv[SABER_K][SABER_N]; //secret key of the server
+    uint16_t pksv[SABER_K][SABER_N]; // vector part of the ciphertext, unpacked
+    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // one element contains one decrypted bit
+    uint8_t scale_ar[SABER_SCALEBYTES_KEM]; // 4-bit-packed tail of the ciphertext
+    uint16_t op[SABER_N]; // scale_ar unpacked, one value per coefficient
+
+    //--------------AVX declaration------------------
+
+
+    //__m256i mod_p;
+
+    __m256i v_avx[SABER_N / 16];
+
+    //__m256i acc[2*SABER_N/16];
+
+    __m256i sksv_avx[SABER_K][SABER_N / 16];
+    __m256i pksv_avx[SABER_K][SABER_N / 16];
+
+    //mod_p=_mm256_set1_epi16(SABER_P-1);
+
+    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
+    //--------------AVX declaration ends------------------
+
+    //-------unpack the secret key and the vector part of the ciphertext
+
+    PQCLEAN_SABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
+    PQCLEAN_SABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
+
+    for (i = 0; i < SABER_K; i++) {
+        for (j = 0; j < SABER_N / 16; j++) {
+            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
+            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
+        }
+    }
+    // Clear v_avx with setzero: XOR-ing the array with itself would read uninitialized vectors.
+    for (i = 0; i < SABER_N / 16; i++) {
+        v_avx[i] = _mm256_setzero_si256();
+    }
+
+
+    // InnerProduct(b', s, mod p)
+
+    for (j = 0; j < NUM_POLY; j++) {
+        TC_eval(sksv_avx[j], b_bucket[j]);
+    }
+
+    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+
+    for (i = 0; i < SABER_N / 16; i++) {
+        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
+    }
+
+
+    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
+        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
+    }
+
+    PQCLEAN_SABER_AVX2_SABER_un_pack4bit(op, scale_ar);
+
+
+    // add h2, subtract op scaled by 2^(EP-ET), reduce mod p, keep only the top bit
+    for (i = 0; i < SABER_N; i++) {
+        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+    }
+
+
+    POL2MSG(m, message_dec_unpacked);
+}
diff --git a/crypto_kem/saber/avx2/SABER_indcpa.h b/crypto_kem/saber/avx2/SABER_indcpa.h
new file mode 100644
index 00000000..acdda606
--- /dev/null
+++ b/crypto_kem/saber/avx2/SABER_indcpa.h
@@ -0,0 +1,13 @@
+#ifndef INDCPA_H
+#define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+
+void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
+
+
+#endif
diff --git a/crypto_kem/saber/avx2/SABER_params.h b/crypto_kem/saber/avx2/SABER_params.h
new file mode 100644
index 00000000..9b0edafe
--- /dev/null
+++ b/crypto_kem/saber/avx2/SABER_params.h
@@ -0,0 +1,46 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+#include "api.h"
+
+
+
+
+#define SABER_K 3
+#define SABER_MU 8
+#define SABER_ET 4
+
+
+#define SABER_EQ 13
+#define SABER_EP 10
+
+#define SABER_N 256
+#define SABER_Q 8192 //2^13
+#define SABER_P 1024
+
+#define SABER_SEEDBYTES       32
+#define SABER_NOISESEEDBYTES  32
+#define SABER_COINBYTES       32
+#define SABER_KEYBYTES        32
+
+#define SABER_HASHBYTES       32
+
+#define SABER_POLYBYTES              416 //13*256/8 
+
+#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
+
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
+
+#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
+
+#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+
+#define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
+#define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
+
+#define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
+
+#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
+
+#endif
diff --git a/crypto_kem/saber/avx2/api.h b/crypto_kem/saber/avx2/api.h
new file mode 100644
index 00000000..20bf0df3
--- /dev/null
+++ b/crypto_kem/saber/avx2/api.h
@@ -0,0 +1,18 @@
+#ifndef PQCLEAN_SABER_AVX2_API_H
+#define PQCLEAN_SABER_AVX2_API_H
+
+
+#define PQCLEAN_SABER_AVX2_CRYPTO_ALGNAME "Saber"
+#define PQCLEAN_SABER_AVX2_CRYPTO_BYTES 32
+#define PQCLEAN_SABER_AVX2_CRYPTO_CIPHERTEXTBYTES 1088
+#define PQCLEAN_SABER_AVX2_CRYPTO_PUBLICKEYBYTES 992
+#define PQCLEAN_SABER_AVX2_CRYPTO_SECRETKEYBYTES 2304
+
+int PQCLEAN_SABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
+
+int PQCLEAN_SABER_AVX2_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_SABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
+
+#endif /* PQCLEAN_SABER_AVX2_API_H */
diff --git a/crypto_kem/saber/avx2/cbd.c b/crypto_kem/saber/avx2/cbd.c
new file mode 100644
index 00000000..7639d7d2
--- /dev/null
+++ b/crypto_kem/saber/avx2/cbd.c
@@ -0,0 +1,51 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
+/*---------------------------------------------------------------------
+This file has been adapted from the public-domain implementation
+(available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+
+
+static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+    // Little-endian load of x[0..bytes-1]; x[0] is the low byte and is always read.
+    uint64_t value = 0;
+    for (int pos = bytes - 1; pos > 0; pos--) {
+        value = (value | x[pos]) << 8;
+    }
+    return value | x[0];
+}
+
+
+void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
+    // Every 32-bit little-endian word of buf yields four coefficients of r,
+    // SABER_N coefficients in total.
+    // Within a word, each nibble is first replaced by the number of its set
+    // bits; a coefficient is then (low nibble - high nibble) of one byte,
+    // masked with SABER_Q - 1 so that negative differences wrap.
+    const uint16_t q_mask = SABER_Q - 1;
+    int word, plane, pair;
+
+    for (word = 0; word < SABER_N / 4; word++) {
+        const uint32_t bits = load_littleendian(buf + 4 * word, 4);
+        uint32_t counts = 0;
+
+        // Sum the four bit planes: nibble n of counts becomes the popcount
+        // of nibble n of bits.
+        for (plane = 0; plane < 4; plane++) {
+            counts += (bits >> plane) & 0x11111111;
+        }
+
+        // One byte of counts (two nibbles) per output coefficient.
+        for (pair = 0; pair < 4; pair++) {
+            const uint32_t lo = (counts >> (8 * pair)) & 0xf;
+            const uint32_t hi = (counts >> (8 * pair + 4)) & 0xf;
+
+            r[4 * word + pair] = (uint16_t)(lo - hi) & q_mask;
+        }
+    }
+}
diff --git a/crypto_kem/saber/avx2/cbd.h b/crypto_kem/saber/avx2/cbd.h
new file mode 100644
index 00000000..e80ffc75
--- /dev/null
+++ b/crypto_kem/saber/avx2/cbd.h
@@ -0,0 +1,16 @@
+#ifndef CBD_H
+#define CBD_H
+/*---------------------------------------------------------------------
+This file has been adapted from the public-domain implementation
+(available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+#include "poly.h"
+#include <stdint.h>
+
+void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+
+
+#endif
diff --git a/crypto_kem/saber/avx2/kem.c b/crypto_kem/saber/avx2/kem.c
new file mode 100644
index 00000000..c88bb315
--- /dev/null
+++ b/crypto_kem/saber/avx2/kem.c
@@ -0,0 +1,79 @@
+#include "SABER_indcpa.h"
+#include "SABER_params.h"
+#include "api.h"
+#include "fips202.h"
+#include "randombytes.h"
+#include "verify.h"
+#include <immintrin.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
+    // Secret key layout produced here:
+    //   [IND-CPA secret key | IND-CPA public key | SHA3-256(pk) | SABER_KEYBYTES random bytes]
+    uint8_t *sk_pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+    uint8_t *sk_hpk = sk + SABER_SECRETKEYBYTES - 64;
+    uint8_t *sk_z = sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES;
+
+    PQCLEAN_SABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
+    memcpy(sk_pk, pk, SABER_INDCPA_PUBLICKEYBYTES); // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
+    sha3_256(sk_hpk, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
+    randombytes(sk_z, SABER_KEYBYTES); // Random tail; this is output when check in PQCLEAN_SABER_AVX2_crypto_kem_dec() fails.
+
+    return (0);
+}
+
+int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
+    // msg_hpk = message || SHA3-256(pk); key_coins = SHA3-512(msg_hpk).
+    // key_coins[0:31] is the pre-key; key_coins[32:63] is first the
+    // encryption randomness and afterwards SHA3-256(c).
+    uint8_t msg_hpk[64];
+    uint8_t key_coins[64];
+    uint8_t *const msg = msg_hpk;
+    uint8_t *const hpk = msg_hpk + 32;
+    uint8_t *const coins = key_coins + 32;
+
+    randombytes(msg, 32);
+    sha3_256(msg, msg, 32); // random message, hashed so the system RNG output is not released
+    sha3_256(hpk, pk, SABER_INDCPA_PUBLICKEYBYTES); // multitarget countermeasure for coins + contributory KEM
+    sha3_512(key_coins, msg_hpk, 64);
+
+    PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, msg, (const uint8_t *) coins, pk);
+
+    sha3_256(coins, c, SABER_BYTES_CCA_DEC); // the coins are overwritten with h(c)
+    sha3_256(k, key_coins, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
+
+int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
+    // Decrypt, re-encrypt with the recovered message, and compare against c.
+    // The comparison result is only handed to cmov; this function never branches on it.
+    const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
+    const uint8_t *stored_hpk = sk + SABER_SECRETKEYBYTES - 64;
+    const uint8_t *stored_z = sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES;
+    uint8_t reencrypted[SABER_BYTES_CCA_DEC];
+    uint8_t msg_hpk[64];
+    uint8_t key_coins[64]; // Will contain key, coins
+    uint8_t mismatch;
+
+    PQCLEAN_SABER_AVX2_indcpa_kem_dec(msg_hpk, sk, c); // msg_hpk[0:31] <-- message
+
+    // Multitarget countermeasure for coins + contributory KEM:
+    // h(pk) is read back from sk instead of being recomputed.
+    memcpy(msg_hpk + 32, stored_hpk, 32);
+
+    sha3_512(key_coins, msg_hpk, 64);
+
+    PQCLEAN_SABER_AVX2_indcpa_kem_enc(reencrypted, msg_hpk, (const uint8_t *) (key_coins + 32), pk);
+    mismatch = PQCLEAN_SABER_AVX2_verify(c, reencrypted, SABER_BYTES_CCA_DEC);
+
+    sha3_256(key_coins + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in key_coins with h(c)
+    PQCLEAN_SABER_AVX2_cmov(key_coins, stored_z, SABER_KEYBYTES, mismatch);
+
+    sha3_256(k, key_coins, 64); // hash concatenation of pre-k and h(c) to k
+
+    return (0);
+}
diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h
new file mode 100644
index 00000000..612ff4ff
--- /dev/null
+++ b/crypto_kem/saber/avx2/kem.h
@@ -0,0 +1,35 @@
+#ifndef KEM_H
+#define KEM_H
+
+// Umbrella header for the Saber KEM (AVX2 build).
+//
+// The include guard is KEM_H, not INDCPA_H: SABER_indcpa.h already uses
+// INDCPA_H, and a shared guard makes whichever header comes second vanish.
+
+#include "SABER_indcpa.h"
+#include "api.h"
+#include <stdint.h>
+
+// The IND-CPA entry points (PQCLEAN_SABER_AVX2_indcpa_kem_keypair/_enc/_dec)
+// are declared by SABER_indcpa.h. Do not redeclare them here: a second copy
+// with another parameter order or without const contradicts the definitions
+// in SABER_indcpa.c.
+//
+// The KEM entry points (PQCLEAN_SABER_AVX2_crypto_kem_keypair/_enc/_dec) are
+// declared by api.h. Repeating either set here trips -Wredundant-decls,
+// which the Makefile turns into an error through -Werror.
+
+// Legacy prototypes kept for source compatibility.
+// NOTE(review): no definition of these three is visible in SABER_indcpa.c or
+// kem.c; confirm whether they can be dropped.
+
+void PQCLEAN_SABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
+
+void PQCLEAN_SABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+
+void PQCLEAN_SABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
+
+//uint64_t clock1,clock2;
+//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex;
+
+#endif
diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c
new file mode 100644
index 00000000..00bf9c08
--- /dev/null
+++ b/crypto_kem/saber/avx2/pack_unpack.c
@@ -0,0 +1,502 @@
+#include "pack_unpack.h"
+
+
+void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
+    // Every 8 inputs (low 3 bits each) form one 24-bit little-endian word: 3 output bytes.
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 8; grp++) {
+        for (k = 0, word = 0; k < 8; k++) {
+            word |= (uint32_t)(data[8 * grp + k] & 0x7) << (3 * k);
+        }
+        bytes[3 * grp + 0] = (uint8_t)(word & 0xff);
+        bytes[3 * grp + 1] = (uint8_t)((word >> 8) & 0xff);
+        bytes[3 * grp + 2] = (uint8_t)((word >> 16) & 0xff);
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
+    // Every 3 input bytes are read as one 24-bit little-endian word and
+    // split into eight 3-bit values.
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 8; grp++) {
+        const uint8_t *in = bytes + 3 * grp;
+
+        word = (uint32_t)in[0];
+        word |= (uint32_t)in[1] << 8;
+        word |= (uint32_t)in[2] << 16;
+
+        // Field k occupies bits 3k .. 3k+2 of the word.
+        for (k = 0; k < 8; k++) {
+            data[8 * grp + k] = (uint16_t)((word >> (3 * k)) & 0x07);
+        }
+    }
+
+}
+
+void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
+    // Two inputs per output byte: the even-indexed one in the low nibble,
+    // the odd-indexed one in the high nibble. Only the low 4 bits are used.
+    const uint16_t *pair = data;
+    uint32_t out;
+
+    for (out = 0; out < SABER_N / 2; out++, pair += 2) {
+        bytes[out] = (pair[0] & 0x0f) | ( (pair[1] & 0x0f) << 4 );
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
+    // Each input byte expands to two outputs: low nibble first, then high nibble.
+    uint16_t *pair = data;
+    uint32_t in;
+
+    for (in = 0; in < SABER_N / 2; in++, pair += 2) {
+        const uint8_t byte = bytes[in];
+        pair[0] = byte & 0x0f;
+        pair[1] = (byte >> 4) & 0x0f;
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
+    // Every 4 inputs (low 6 bits each) form one 24-bit little-endian word: 3 output bytes.
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 4; grp++) {
+        for (k = 0, word = 0; k < 4; k++) {
+            word |= (uint32_t)(data[4 * grp + k] & 0x3f) << (6 * k);
+        }
+        bytes[3 * grp + 0] = (uint8_t)(word & 0xff);
+        bytes[3 * grp + 1] = (uint8_t)((word >> 8) & 0xff);
+        bytes[3 * grp + 2] = (uint8_t)((word >> 16) & 0xff);
+    }
+}
+
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
+    // Every 3 input bytes are read as one 24-bit little-endian word and
+    // split into four 6-bit values.
+    uint32_t grp, k, word;
+
+    for (grp = 0; grp < SABER_N / 4; grp++) {
+        const uint8_t *in = bytes + 3 * grp;
+
+        word = (uint32_t)in[0] | ((uint32_t)in[1] << 8) | ((uint32_t)in[2] << 16);
+        for (k = 0; k < 4; k++) {
+            data[4 * grp + k] = (uint16_t)((word >> (6 * k)) & 0x3f);
+        }
+    }
+
+}
+
+void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Packs SABER_K polynomials back to back, 10 low bits per coefficient.
+    // Four coefficients form one 40-bit little-endian word, i.e. 5 bytes;
+    // polynomial number poly starts at byte poly * (SABER_N * 10) / 8.
+    uint32_t poly, grp, k;
+    uint64_t word;
+    uint8_t *out;
+
+    for (poly = 0; poly < SABER_K; poly++) {
+        for (grp = 0; grp < SABER_N / 4; grp++) {
+            // Collect the four 10-bit fields, lowest coefficient in the lowest bits.
+            for (k = 0, word = 0; k < 4; k++) {
+                word |= (uint64_t)(data[poly][4 * grp + k] & 0x3ff) << (10 * k);
+            }
+            // Emit the word one byte at a time, least significant byte first.
+            out = bytes + poly * (SABER_N * 10) / 8 + 5 * grp;
+            for (k = 0; k < 5; k++) {
+                out[k] = (uint8_t)((word >> (8 * k)) & 0xff);
+            }
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Serializes SABER_K polynomials, 10 low bits per coefficient,
+    // 4 coefficients -> 5 bytes, polynomials stored back to back.
+    uint32_t poly, grp;
+
+    for (poly = 0; poly < SABER_K; poly++) {
+        uint8_t *out = bytes + poly * (SABER_N * 10) / 8;
+        const uint16_t *in = data[poly];
+        for (grp = 0; grp < SABER_N / 4; grp++, out += 5, in += 4) {
+            // byte 0: in[0] bits 0-7
+            out[0] = ( in[0] & (0xff));
+            // byte 1: in[0] bits 8-9, in[1] bits 0-5
+            out[1] = ( (in[0] >> 8) & 0x03 ) | ((in[1] & 0x3f) << 2);
+            // byte 2: in[1] bits 6-9, in[2] bits 0-3
+            out[2] = ( (in[1] >> 6) & 0x0f ) | ( (in[2] & 0x0f) << 4);
+            // byte 3: in[2] bits 4-9, in[3] bits 0-1
+            out[3] = ( (in[2] >> 4) & 0x3f ) | ((in[3] & 0x03) << 6);
+            // byte 4: in[3] bits 2-9
+            out[4] = ( (in[3] >> 2) & 0xff );
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    // Serializes SABER_K polynomials, 13 low bits per coefficient,
+    // 8 coefficients -> 13 bytes, polynomials stored back to back.
+    uint32_t poly, grp;
+
+    for (poly = 0; poly < SABER_K; poly++) {
+        uint8_t *out = bytes + poly * (SABER_N * 13) / 8;
+        const uint16_t *in = data[poly];
+
+        for (grp = 0; grp < SABER_N / 8; grp++, out += 13, in += 8) {
+            // byte 0: in[0] bits 0-7
+            out[0] = ( in[0] & (0xff));
+            // byte 1: in[0] bits 8-12, in[1] bits 0-2
+            out[1] = ( (in[0] >> 8) & 0x1f ) | ((in[1] & 0x07) << 5);
+            // byte 2: in[1] bits 3-10
+            out[2] = ( (in[1] >> 3) & 0xff );
+            // byte 3: in[1] bits 11-12, in[2] bits 0-5
+            out[3] = ( (in[1] >> 11) & 0x03 ) | ((in[2] & 0x3f) << 2);
+            // byte 4: in[2] bits 6-12, in[3] bit 0
+            out[4] = ( (in[2] >> 6) & 0x7f ) | ( (in[3] & 0x01) << 7 );
+            // byte 5: in[3] bits 1-8
+            out[5] = ( (in[3] >> 1) & 0xff );
+            // byte 6: in[3] bits 9-12, in[4] bits 0-3
+            out[6] = ( (in[3] >> 9) & 0x0f ) | ( (in[4] & 0x0f) << 4 );
+            // byte 7: in[4] bits 4-11
+            out[7] = ( (in[4] >> 4) & 0xff );
+            // byte 8: in[4] bit 12, in[5] bits 0-6
+            out[8] = ( (in[4] >> 12) & 0x01 ) | ( (in[5] & 0x7f) << 1 );
+            // byte 9: in[5] bits 7-12, in[6] bits 0-1
+            out[9] = ( (in[5] >> 7) & 0x3f ) | ( (in[6] & 0x03) << 6 );
+            // byte 10: in[6] bits 2-9
+            out[10] = ( (in[6] >> 2) & 0xff );
+            // byte 11: in[6] bits 10-12, in[7] bits 0-4
+            out[11] = ( (in[6] >> 10) & 0x07 ) | ( (in[7] & 0x1f) << 3 );
+            // byte 12: in[7] bits 5-12
+            out[12] = ( (in[7] >> 5) & 0xff );
+        }
+    }
+
+
+}
+
+void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
+    // Deserializes one polynomial: every 13 input bytes give 8 coefficients
+    // of 13 bits each, lowest bits first.
+    uint32_t grp;
+
+    for (grp = 0; grp < SABER_N / 8; grp++) {
+        const uint8_t *in = bytes + 13 * grp;
+        uint16_t *out = data + 8 * grp;
+        out[0] = ( in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
+        out[1] = ( (in[1] >> 5) & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = ( (in[3] >> 2) & (0x3f)) | ((in[4] & 0x7f) << 6);
+        out[3] = ( (in[4] >> 7) & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = ( (in[6] >> 4) & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = ( (in[8] >> 1) & (0x7f)) | ((in[9] & 0x3f) << 7);
+        out[6] = ( (in[9] >> 6) & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = ( (in[11] >> 3) & (0x1f)) | ((in[12] & 0xff) << 5);
+    }
+}
+
+
+
+void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Unpacks SABER_K polynomials whose coefficients are stored as 10-bit
+     * fields, least-significant bits first; each polynomial occupies
+     * SABER_N * 10 / 8 bytes and every 5 bytes yield 4 coefficients. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 10) / 8;
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint8_t *in = poly_bytes + 5 * blk;
+            uint16_t *out = data[row] + 4 * blk;
+            out[0] = in[0] | ((in[1] & 0x03) << 8);
+            out[1] = ((in[1] >> 2) & 0x3f) | ((in[2] & 0x0f) << 6);
+            out[2] = ((in[2] >> 4) & 0x0f) | ((in[3] & 0x3f) << 4);
+            out[3] = ((in[3] >> 6) & 0x03) | (in[4] << 2);
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Unpacks SABER_K polynomials whose coefficients are stored as 13-bit
+     * fields, least-significant bits first. Each polynomial occupies
+     * SABER_N * 13 / 8 bytes of the input, and every 13 bytes yield
+     * 8 coefficients. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 13) / 8;
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint8_t *in = poly_bytes + 13 * blk;
+            uint16_t *out = data[row] + 8 * blk;
+            out[0] = in[0] | ((in[1] & 0x1f) << 8);
+            out[1] = ((in[1] >> 5) & 0x07) | (in[2] << 3) | ((in[3] & 0x03) << 11);
+            out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+            out[3] = ((in[4] >> 7) & 0x01) | (in[5] << 1) | ((in[6] & 0x0f) << 9);
+            out[4] = ((in[6] >> 4) & 0x0f) | (in[7] << 4) | ((in[8] & 0x01) << 12);
+            out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+            out[6] = ((in[9] >> 6) & 0x03) | (in[10] << 2) | ((in[11] & 0x07) << 10);
+            out[7] = ((in[11] >> 3) & 0x1f) | (in[12] << 5);
+        }
+    }
+}
+
+
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expands a packed stream of 10-bit fields (least-significant bits
+     * first) into SABER_K polynomials of SABER_N coefficients each.
+     * A polynomial takes SABER_N * 10 / 8 input bytes; each group of
+     * 5 bytes carries 4 coefficients. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 10) / 8;
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint8_t *in = poly_bytes + 5 * blk;
+            uint16_t *out = data[row] + 4 * blk;
+
+            out[0] = in[0] | ((in[1] & 0x03) << 8);
+            out[1] = ((in[1] >> 2) & 0x3f) | ((in[2] & 0x0f) << 6);
+            out[2] = ((in[2] >> 4) & 0x0f) | ((in[3] & 0x3f) << 4);
+            out[3] = ((in[3] >> 6) & 0x03) | (in[4] << 2);
+        }
+    }
+}
+
+
+void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Packs the low 13 bits of every coefficient of SABER_K polynomials into
+     * a contiguous byte stream, least-significant bits first. A polynomial
+     * takes SABER_N * 13 / 8 output bytes; each group of 8 coefficients is
+     * written as 13 bytes. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        uint8_t *poly_bytes = bytes + row * (SABER_N * 13) / 8;
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint16_t *in = data[row] + 8 * blk;
+            uint8_t *out = poly_bytes + 13 * blk;
+            out[0] = in[0] & 0xff;
+
+            out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+
+            out[2] = (in[1] >> 3) & 0xff;
+
+            out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+
+            out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+
+            out[5] = (in[3] >> 1) & 0xff;
+
+            out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+
+            out[7] = (in[4] >> 4) & 0xff;
+
+            out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+
+            out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+
+            out[10] = (in[6] >> 2) & 0xff;
+
+            out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+
+            out[12] = (in[7] >> 5) & 0xff;
+
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expands a packed stream of 13-bit fields (least-significant bits
+     * first) into SABER_K polynomials of SABER_N coefficients each.
+     * A polynomial takes SABER_N * 13 / 8 input bytes; each group of
+     * 13 bytes carries 8 coefficients. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 13) / 8;
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint8_t *in = poly_bytes + 13 * blk;
+            uint16_t *out = data[row] + 8 * blk;
+            out[0] = in[0] | ((in[1] & 0x1f) << 8);
+            out[1] = ((in[1] >> 5) & 0x07) | (in[2] << 3) | ((in[3] & 0x03) << 11);
+            out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+            out[3] = ((in[4] >> 7) & 0x01) | (in[5] << 1) | ((in[6] & 0x0f) << 9);
+            out[4] = ((in[6] >> 4) & 0x0f) | (in[7] << 4) | ((in[8] & 0x01) << 12);
+            out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+            out[6] = ((in[9] >> 6) & 0x03) | (in[10] << 2) | ((in[11] & 0x07) << 10);
+            out[7] = ((in[11] >> 3) & 0x1f) | (in[12] << 5);
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
+    /* Expands SABER_N * 13 / 8 bytes of 13-bit fields (least-significant
+     * bits first) into a single polynomial; 13 bytes carry 8 coefficients. */
+    uint32_t blk;
+
+    for (blk = 0; blk < SABER_N / 8; blk++) {
+        const uint8_t *in = bytes + 13 * blk;
+        uint16_t *out = data + 8 * blk;
+        out[0] = in[0] | ((in[1] & 0x1f) << 8);
+
+        out[1] = ((in[1] >> 5) & 0x07) | (in[2] << 3) | ((in[3] & 0x03) << 11);
+
+        out[2] = ((in[3] >> 2) & 0x3f) | ((in[4] & 0x7f) << 6);
+
+        out[3] = ((in[4] >> 7) & 0x01) | (in[5] << 1) | ((in[6] & 0x0f) << 9);
+
+        out[4] = ((in[6] >> 4) & 0x0f) | (in[7] << 4) | ((in[8] & 0x01) << 12);
+
+        out[5] = ((in[8] >> 1) & 0x7f) | ((in[9] & 0x3f) << 7);
+
+        out[6] = ((in[9] >> 6) & 0x03) | (in[10] << 2) | ((in[11] & 0x07) << 10);
+
+        out[7] = ((in[11] >> 3) & 0x1f) | (in[12] << 5);
+    }
+}
+
+
+void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Packs the low 11 bits of every coefficient of SABER_K polynomials into
+     * a contiguous byte stream, least-significant bits first. A polynomial
+     * takes SABER_N * 11 / 8 output bytes; each group of 8 coefficients is
+     * written as 11 bytes. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        uint8_t *poly_bytes = bytes + row * (SABER_N * 11) / 8;
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint16_t *in = data[row] + 8 * blk;
+            uint8_t *out = poly_bytes + 11 * blk;
+            out[0] = in[0] & 0xff;
+
+            out[1] = ((in[0] >> 8) & 0x07) | ((in[1] & 0x1f) << 3);
+
+            out[2] = ((in[1] >> 5) & 0x3f) | ((in[2] & 0x03) << 6);
+
+            out[3] = (in[2] >> 2) & 0xff;
+
+            out[4] = ((in[2] >> 10) & 0x01) | ((in[3] & 0x7f) << 1);
+
+            out[5] = ((in[3] >> 7) & 0x0f) | ((in[4] & 0x0f) << 4);
+
+            out[6] = ((in[4] >> 4) & 0x7f) | ((in[5] & 0x01) << 7);
+
+            out[7] = (in[5] >> 1) & 0xff;
+
+            out[8] = ((in[5] >> 9) & 0x03) | ((in[6] & 0x3f) << 2);
+
+            out[9] = ((in[6] >> 6) & 0x1f) | ((in[7] & 0x07) << 5);
+
+            out[10] = (in[7] >> 3) & 0xff;
+
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expands a packed stream of 11-bit fields (least-significant bits
+     * first) into SABER_K polynomials of SABER_N coefficients each.
+     * A polynomial takes SABER_N * 11 / 8 input bytes; each group of
+     * 11 bytes carries 8 coefficients. This is the inverse layout of
+     * PQCLEAN_SABER_AVX2_SABER_pack11bit. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 11) / 8;
+        for (blk = 0; blk < SABER_N / 8; blk++) {
+            const uint8_t *in = poly_bytes + 11 * blk;
+            uint16_t *out = data[row] + 8 * blk;
+            out[0] = in[0] | ((in[1] & 0x07) << 8);
+
+            out[1] = ((in[1] >> 3) & 0x1f) | ((in[2] & 0x3f) << 5);
+
+            out[2] = ((in[2] >> 6) & 0x03) | (in[3] << 2) | ((in[4] & 0x01) << 10);
+
+            out[3] = ((in[4] >> 1) & 0x7f) | ((in[5] & 0x0f) << 7);
+
+            out[4] = ((in[5] >> 4) & 0x0f) | ((in[6] & 0x7f) << 4);
+
+            out[5] = ((in[6] >> 7) & 0x01) | (in[7] << 1) | ((in[8] & 0x03) << 9);
+
+            out[6] = ((in[8] >> 2) & 0x3f) | ((in[9] & 0x1f) << 6);
+
+            out[7] = ((in[9] >> 5) & 0x07) | (in[10] << 3);
+
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+    /* Packs the low 14 bits of every coefficient of SABER_K polynomials into
+     * a contiguous byte stream, least-significant bits first. A polynomial
+     * takes SABER_N * 14 / 8 output bytes; each group of 4 coefficients is
+     * written as 7 bytes. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        uint8_t *poly_bytes = bytes + row * (SABER_N * 14) / 8;
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint16_t *in = data[row] + 4 * blk;
+            uint8_t *out = poly_bytes + 7 * blk;
+            out[0] = in[0] & 0xff;
+
+            out[1] = ((in[0] >> 8) & 0x3f) | ((in[1] & 0x03) << 6);
+
+            out[2] = (in[1] >> 2) & 0xff;
+
+            out[3] = ((in[1] >> 10) & 0x0f) | ((in[2] & 0x0f) << 4);
+
+            out[4] = (in[2] >> 4) & 0xff;
+
+            out[5] = ((in[2] >> 12) & 0x03) | ((in[3] & 0x3f) << 2);
+
+            out[6] = (in[3] >> 6) & 0xff;
+        }
+    }
+}
+
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+    /* Expands a packed stream of 14-bit fields (least-significant bits
+     * first) into SABER_K polynomials of SABER_N coefficients each.
+     * A polynomial takes SABER_N * 14 / 8 input bytes; each group of
+     * 7 bytes carries 4 coefficients. */
+    uint32_t row, blk;
+
+    for (row = 0; row < SABER_K; row++) {
+        const uint8_t *poly_bytes = bytes + row * (SABER_N * 14) / 8;
+        for (blk = 0; blk < SABER_N / 4; blk++) {
+            const uint8_t *in = poly_bytes + 7 * blk;
+            uint16_t *out = data[row] + 4 * blk;
+            out[0] = in[0] | ((in[1] & 0x3f) << 8);
+
+            out[1] = ((in[1] >> 6) & 0x03) | (in[2] << 2) | ((in[3] & 0x0f) << 10);
+
+            out[2] = ((in[3] >> 4) & 0x0f) | (in[4] << 4) | ((in[5] & 0x03) << 12);
+
+            out[3] = ((in[5] >> 2) & 0x3f) | (in[6] << 6);
+        }
+    }
+}
+
+void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
+    /* Dispatches on the modulus; any value other than 1024 or 8192 leaves bytes untouched. */
+    switch (modulus) {
+    case 1024: PQCLEAN_SABER_AVX2_POLVECp2BS(bytes, data); break;
+    case 8192: PQCLEAN_SABER_AVX2_POLVECq2BS(bytes, data); break;
+    default: break;
+    }
+}
+
+void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
+    /* Dispatches on the modulus; any value other than 1024 or 8192 leaves data untouched. */
+
+    switch (modulus) {
+    case 1024: PQCLEAN_SABER_AVX2_BS2POLVECp(data, bytes); break;
+    case 8192: PQCLEAN_SABER_AVX2_BS2POLVECq(data, bytes); break;
+    default: break;
+    }
+}
diff --git a/crypto_kem/saber/avx2/pack_unpack.h b/crypto_kem/saber/avx2/pack_unpack.h
new file mode 100644
index 00000000..e1608d4c
--- /dev/null
+++ b/crypto_kem/saber/avx2/pack_unpack.h
@@ -0,0 +1,56 @@
+#ifndef PACK_UNPACK_H
+#define PACK_UNPACK_H
+#include "SABER_params.h"
+#include <stdint.h>
+#include <stdio.h>
+/* Byte string -> polynomial / polynomial vector: "bytes" is the packed input, "data" the output. */
+void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+/* modulus 1024 selects BS2POLVECp (10-bit fields), 8192 selects BS2POLVECq (13-bit fields); other values do nothing. */
+void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
+
+void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+/* Polynomial vector -> byte string: modulus 1024 selects POLVECp2BS, 8192 selects POLVECq2BS; other values do nothing. */
+void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+
+void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Fixed-width packers: coefficients are read from "data" and written to "bytes". */
+void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
+
+void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+
+/* Single-polynomial form of the 13-bit unpacker (SABER_N coefficients). */
+void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+
+/* Fixed-width unpackers: packed fields are read from "bytes" and written to "data". */
+void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+
+
+#endif
diff --git a/crypto_kem/saber/avx2/poly.h b/crypto_kem/saber/avx2/poly.h
new file mode 100644
index 00000000..2978d0d8
--- /dev/null
+++ b/crypto_kem/saber/avx2/poly.h
@@ -0,0 +1,27 @@
+#ifndef POLY_H
+#define POLY_H
+/*---------------------------------------------------------------------
+This file has been adapted from the implementation
+(Public Domain, available at https://github.com/pq-crystals/kyber)
+of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+by: Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehle
+----------------------------------------------------------------------*/
+#include "SABER_params.h"
+#include <stdint.h>
+
+typedef struct {
+    uint16_t coeffs[SABER_N]; /* one 16-bit word per coefficient, SABER_N coefficients */
+} poly;
+
+typedef struct {
+    poly vec[SABER_K]; /* SABER_K polynomials */
+} polyvec;
+
+void PQCLEAN_SABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+
+/* NOTE(review): three output pointers (r0..r2) but four nonces (nonce0..nonce3) - confirm against the definition. */
+void PQCLEAN_SABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+
+
+#endif
diff --git a/crypto_kem/saber/avx2/polymul/consts.h b/crypto_kem/saber/avx2/polymul/consts.h
new file mode 100644
index 00000000..40826398
--- /dev/null
+++ b/crypto_kem/saber/avx2/polymul/consts.h
@@ -0,0 +1,20 @@
+#include "../SABER_params.h"
+/* Size constants for the code under polymul/; NOTE(review): this header has no include guard. */
+#define AVX_N (SABER_N >> 4) /* SABER_N / 16; presumably 256-bit vectors (16 x 16-bit lanes) per polynomial - confirm */
+#define small_len_avx (AVX_N >> 2) /* AVX_N / 4 */
+
+#define SCHB_N 16
+
+#define N_SB (SABER_N >> 2) /* SABER_N / 4 */
+#define N_SB_RES (2*N_SB-1) /* 2 * N_SB - 1; presumably the length of a product of two N_SB-coefficient polynomials */
+
+#define N_SB_16 (N_SB >> 2) /* N_SB / 4, i.e. SABER_N / 16 */
+#define N_SB_16_RES (2*N_SB_16-1) /* 2 * N_SB_16 - 1 */
+/* NOTE(review): AVX_N1 is hard-coded; it matches the N/16 remark only when SABER_N is 256 - confirm. */
+#define AVX_N1 16 /*N/16*/ 
+
+#define SCM_SIZE 16
+
+// The dimension of a vector, i.e. a vector has NUM_POLY elements and a matrix has NUM_POLY x NUM_POLY elements
+#define NUM_POLY SABER_K
+//int NUM_POLY=2; 
diff --git a/crypto_kem/saber/avx2/polymul/matrix.c b/crypto_kem/saber/avx2/polymul/matrix.c
new file mode 100644
index 00000000..5fa35783
--- /dev/null
+++ b/crypto_kem/saber/avx2/polymul/matrix.c
@@ -0,0 +1,303 @@
+#include <immintrin.h>
+
+static void transpose_n1(__m256i *M)
+{
+	// In-place transpose of the 16x16 matrix of 16-bit words held in M[0..15] (one row per vector).
+	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+	register __m256i temp, temp0, temp1, temp2;
+
+	// Stage 1, low halves: interleave the 16-bit words of each adjacent row pair
+	// (the unpack intrinsics work independently inside each 128-bit lane).
+		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
+		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
+		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
+		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
+		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
+		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
+
+		// Stage 2: interleave 32-bit pairs, so each vector now mixes four input rows.
+		temp = _mm256_unpacklo_epi32(r0, r1); 
+		temp0 = _mm256_unpacklo_epi32(r2, r3); 
+		temp1 = _mm256_unpacklo_epi32(r4, r5); 
+		temp2 = _mm256_unpacklo_epi32(r6, r7); 
+
+		r8 = _mm256_unpackhi_epi32(r0, r1); 
+		r9 = _mm256_unpackhi_epi32(r2, r3); 
+		r10 = _mm256_unpackhi_epi32(r4, r5); 
+		r11 = _mm256_unpackhi_epi32(r6, r7);
+		// Stage 3: interleave 64-bit groups, so each vector now mixes eight input rows.
+		r0 = _mm256_unpacklo_epi64(temp, temp0); 
+		r2 = _mm256_unpackhi_epi64(temp, temp0); 
+
+		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
+		r3 = _mm256_unpackhi_epi64(temp1, temp2);
+		// Stage 1, high halves. M[8] and M[9] are read here because they are overwritten just below.
+		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
+		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
+		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
+		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
+		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
+		// Lane combination: 0x20 joins the two low 128-bit lanes, 0x31 the two high lanes.
+		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
+		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+
+		// M[10..15] still hold input rows at this point (they are only written further down).
+		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
+		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
+		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+
+		// Stage 3 for the high 32-bit interleaves saved in r8..r11.
+		r0 = _mm256_unpacklo_epi64(r8, r9); 
+		r1 = _mm256_unpacklo_epi64(r10, r11); 
+
+		r2 = _mm256_unpackhi_epi64(r8, r9); 
+		r3 = _mm256_unpackhi_epi64(r10, r11); 
+
+
+
+		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+
+	// Stage 2 for the high halves (low 32-bit interleave);
+	// these feed output rows 4, 5, 12 and 13.
+		r0 = _mm256_unpacklo_epi32(temp, temp0); 
+		r1 = _mm256_unpacklo_epi32(temp1, temp2);
+		r2 = _mm256_unpacklo_epi32(r4, r5); 
+		r3 = _mm256_unpacklo_epi32(r6, r7); 
+
+
+
+
+	// Stage 3 and lane combination for output rows 4, 5, 12 and 13.
+
+		r8 = _mm256_unpacklo_epi64(r0, r1); 
+		r10 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r9 = _mm256_unpacklo_epi64(r2, r3); 
+		r11 = _mm256_unpackhi_epi64(r2, r3); 
+
+		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+
+		r0 = _mm256_unpackhi_epi32(temp, temp0); 
+		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
+		r2 = _mm256_unpackhi_epi32(r4, r5); 
+		r3 = _mm256_unpackhi_epi32(r6, r7); 
+
+
+	// Stage 3 for the high 32-bit interleave of the high halves;
+	// these feed output rows 6, 7, 14 and 15.
+		r4 = _mm256_unpacklo_epi64(r0, r1); 
+		r6 = _mm256_unpackhi_epi64(r0, r1); 
+
+		r5 = _mm256_unpacklo_epi64(r2, r3); 
+		r7 = _mm256_unpackhi_epi64(r2, r3); 
+
+
+
+	// Lane combination for output rows 6, 7, 14 and 15.
+
+	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+/*
+void transpose_unrolled(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
+
+	//for(i=0; i<8; i=i+1)
+	//{
+		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
+		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
+
+		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
+		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
+
+		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
+		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
+
+		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
+		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
+
+		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
+		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
+
+		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
+		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
+
+		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
+		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
+
+		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
+		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
+
+	//}
+
+	//-------------------------------------------------------
+	//for(i=0; i<4; i=i+1)
+	//{
+		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
+		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
+
+		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
+		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
+
+		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
+		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
+
+		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
+		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
+
+	//}
+
+	//for(i=0; i<2; i=i+1)
+	//{
+		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
+		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
+
+		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
+		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
+
+		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	//}
+	//for(i=0; i<2; i=i+1)
+	//{
+		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
+		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
+
+		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
+		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
+
+	//}
+
+	//-------------------------------------------------------
+
+	//-------------------------------------------------------
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	//-------------------------------------------------------
+
+
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+
+
+void transpose1(__m256i *M)
+{
+	int i;
+	__m256i tL[8], tH[8];
+	__m256i bL[4], bH[4], cL[4], cH[4];
+	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
+
+	for(i=0; i<8; i=i+1)
+	{
+		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
+		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
+	}
+
+	for(i=0; i<4; i=i+1)
+	{
+		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
+		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
+	}
+	for(i=0; i<4; i=i+1)
+	{
+		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
+		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
+		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
+		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
+	}
+
+	for(i=0; i<2; i=i+1)
+	{
+		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
+		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
+	}
+	for(i=0; i<2; i=i+1)
+	{
+		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
+		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
+	}
+
+	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
+	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
+	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
+	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
+
+	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
+	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
+	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
+	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
+
+	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
+	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
+	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
+	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
+
+	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
+	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
+	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
+	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
+}
+*/
diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c
new file mode 100644
index 00000000..4e4f11f8
--- /dev/null
+++ b/crypto_kem/saber/avx2/polymul/scm_avx.c
@@ -0,0 +1,753 @@
+//#define SCM_SIZE 16
+
+//#pragma STDC FP_CONTRACT ON
+
+#include <immintrin.h>
+
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { /* lane-wise 16-bit a*b + c; static so a non-inlined call still links (C99 inline emits no external definition) */
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c); /* low 16 bits of each product, then a wrapping add */
+}
+
+
+static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) // accumulating variant: a[0..7] and b[0..7] are cached in locals below
+									      // the products are ADDED to the values already stored in c_avx[0..30]
+{
+	// Lane-wise schoolbook product of a[0..15] and b[0..15]: c_avx[k] += sum over i+j==k of a[i]*b[j]; 16-bit lanes, wrapping arithmetic.
+	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	register __m256i temp;
+	// a[8..15] and b[8..15] are not cached in the first triangle; they are read through the pointers.
+
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// First triangle (unrolled): c_avx[k] for k = 0..15, each a sum of k+1 products a[i]*b[k-i]
+
+	// each sum is added to the value already in c_avx[k] instead of overwriting it
+	c_avx[0] = mul_add(a0, b0, c_avx[0]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	temp=mul_add(a1, b0, temp);
+	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+	temp = mul_add(a1, b1, temp);
+	temp=mul_add(a2, b0, temp);
+	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
+	
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	temp=mul_add(a3, b0, temp);
+	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	temp=mul_add(a2, b2, temp);
+	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	temp=mul_add(a5, b0, temp);
+	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	temp=mul_add(a4, b2, temp);
+	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	temp=mul_add(a5, b2, temp);
+	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	
+		temp=mul_add(a6, b2, temp);
+		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+		temp=mul_add(a7, b2, temp);
+		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
+
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+		temp=mul_add(a[8], b2, temp);
+		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+		temp=mul_add(a[9], b2, temp);
+		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+		temp=mul_add(a[10], b2, temp);
+		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+		temp=mul_add(a[11], b2, temp);
+		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
+
+
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+		temp=mul_add(a[12], b2, temp);
+		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+		temp=mul_add(a[13], b2, temp);
+		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
+
+
+	// Second triangle (unrolled): c_avx[k] for k = 16..30. The locals are rebound to the upper halves; note a0/b0 hold index 14 and a1/b1 hold index 15.
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+		temp=mul_add(a1, b[1], temp);
+		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
+
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+		temp=mul_add(a1, b[2], temp);
+		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
+
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+		temp=mul_add(a1, b[3], temp);
+		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
+
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+		temp=mul_add(a1, b[4], temp);
+		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
+
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+		temp=mul_add(a1, b[5], temp);
+		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
+
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+		temp=mul_add(a1, b[6], temp);
+		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
+
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+		temp=mul_add(a1, b[7], temp);
+		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
+
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+		temp=mul_add(a1, b7, temp);
+		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
+
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+		temp=mul_add(a1, b6, temp);
+		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
+
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+		temp=mul_add(a1, b5, temp);
+		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
+
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+		temp=mul_add(a1, b4, temp);
+		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
+
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+		temp=mul_add(a1, b3, temp);
+		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
+
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+		temp=mul_add(a1, b2, temp);
+		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
+
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+		temp=mul_add(a1, b0, temp);
+		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
+
+
+		c_avx[30] = mul_add(a1, b1, c_avx[30]);
+
+
+	// The last slot is overwritten with zero rather than accumulated (index 31 if SCM_SIZE is 16; SCM_SIZE is not defined in this file).
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+
+}
+
+
+
+static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) // non-accumulating variant: a[0..7] and b[0..7] are cached in locals below
+									      // c_avx[0..30] are overwritten; their previous contents are not read
+{
+	// Lane-wise schoolbook product of a[0..15] and b[0..15]: c_avx[k] = sum over i+j==k of a[i]*b[j]; 16-bit lanes, wrapping arithmetic.
+	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+	__m256i temp;
+	// a[8..15] and b[8..15] are not cached in the first triangle; they are read through the pointers.
+
+	a0=a[0];
+	a1=a[1];
+	a2=a[2];
+	a3=a[3];
+	a4=a[4];
+	a5=a[5];
+	a6=a[6];
+	a7=a[7];
+
+	b0=b[0];
+	b1=b[1];
+	b2=b[2];
+	b3=b[3];
+	b4=b[4];
+	b5=b[5];
+	b6=b[6];
+	b7=b[7];
+
+	// First triangle (unrolled): c_avx[k] for k = 0..15, each a sum of k+1 products a[i]*b[k-i]
+	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[1]=mul_add(a1, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b2);
+
+	temp = mul_add(a1, b1, temp);
+	c_avx[2]= mul_add(a2, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b3);
+	temp = mul_add(a1, b2, temp);
+	temp = mul_add(a2, b1, temp);
+	c_avx[3]= mul_add(a3, b0, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b4);
+	temp = mul_add(a1, b3, temp);
+	temp = mul_add(a3, b1, temp);
+	temp = mul_add(a4, b0, temp);
+	c_avx[4]= mul_add(a2, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b5);
+	temp = mul_add(a1, b4 , temp);
+	temp = mul_add(a2, b3, temp);
+	temp = mul_add(a3, b2, temp);
+	temp = mul_add( a4, b1, temp);
+	c_avx[5] = mul_add(a5, b0, temp);
+	
+	temp = _mm256_mullo_epi16 (a0, b6);
+	temp = mul_add(a1, b5, temp);
+	temp = mul_add(a5, b1, temp);
+	temp = mul_add(a6, b0, temp);
+	temp = mul_add(a2, b4, temp);
+	temp = mul_add(a3, b3, temp);
+	c_avx[6] = mul_add(a4, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b7);
+	temp = mul_add(a1, b6, temp);
+	temp = mul_add (a6, b1, temp);
+	temp = mul_add (a7, b0, temp);
+	temp = mul_add(a2, b5, temp);
+	temp = mul_add (a3, b4, temp);
+	temp = mul_add (a4, b3, temp);
+	c_avx[7] = mul_add (a5, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[8]);
+	temp = mul_add (a1, b7, temp);
+	temp = mul_add (a7, b1, temp);
+	temp = mul_add (a[8], b0, temp);
+	temp = mul_add (a2, b6,temp);
+	temp = mul_add(a3, b5, temp);
+	temp = mul_add (a4, b4,temp);
+	temp = mul_add (a5, b3, temp);
+	c_avx[8] = mul_add (a6, b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[9]);
+	temp = mul_add (a1, b[8], temp);
+	temp = mul_add (a[8], b1, temp);
+	temp = mul_add (a[9], b0, temp);
+	temp = mul_add (a2, b7, temp);
+	temp = mul_add (a3, b6, temp);
+	temp = mul_add (a4, b5, temp);
+	temp = mul_add (a5, b4, temp);
+	temp = mul_add (a6, b3, temp);
+	c_avx[9] = mul_add (a7, b2, temp);
+
+	temp= _mm256_mullo_epi16 (a0, b[10]);
+	temp = mul_add (a1, b[9], temp);
+	temp = mul_add (a[9], b1, temp);
+	temp = mul_add (a[10], b0, temp);
+	temp = mul_add (a2, b[8], temp);
+	temp = mul_add (a3, b7, temp);
+	temp = mul_add (a4, b6, temp);
+	temp = mul_add (a5, b5, temp);
+	temp = mul_add (a6, b4, temp);
+	temp = mul_add (a7, b3, temp);
+	c_avx[10] = mul_add (a[8], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[11]);
+	temp = mul_add (a1, b[10], temp );
+	temp = mul_add (a[10], b1, temp );
+	temp = mul_add (a[11], b0, temp );
+	temp = mul_add (a2, b[9], temp );
+	temp = mul_add (a3, b[8], temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a[8], b3, temp );
+	c_avx[11] = mul_add (a[9], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[12]);
+	temp = mul_add (a1, b[11], temp);
+	temp = mul_add (a[11], b1, temp);
+	temp = mul_add (a[12], b0, temp);
+	temp = mul_add (a2, b[10], temp);
+	temp = mul_add (a3, b[9], temp);
+	temp = mul_add (a4, b[8], temp);
+	temp = mul_add (a5, b7, temp);
+	temp = mul_add (a6, b6, temp);
+	temp = mul_add (a7, b5, temp);
+	temp = mul_add (a[8], b4, temp);
+	temp = mul_add (a[9], b3, temp);
+	c_avx[12] = mul_add (a[10], b2, temp);
+
+	temp = _mm256_mullo_epi16 (a0, b[13]);
+	temp = mul_add (a1, b[12], temp );
+	temp = mul_add (a[12], b1, temp );
+	temp = mul_add (a[13], b0, temp );
+	temp = mul_add (a2, b[11], temp );
+	temp = mul_add (a3, b[10], temp );
+	temp = mul_add (a4, b[9], temp );
+	temp = mul_add (a5, b[8], temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a[8], b5, temp );
+	temp = mul_add (a[9], b4, temp );
+	temp = mul_add (a[10], b3, temp );
+	c_avx[13] = mul_add (a[11], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[14]);
+	temp = mul_add (a1, b[13], temp );
+	temp = mul_add (a[13], b1, temp );
+	temp = mul_add (a[14], b0, temp );
+	temp = mul_add (a2, b[12], temp );
+	temp = mul_add (a3, b[11], temp );
+	temp = mul_add (a4, b[10], temp );
+	temp = mul_add (a5, b[9], temp );
+	temp = mul_add (a6, b[8], temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a[8], b6, temp );
+	temp = mul_add (a[9], b5, temp );
+	temp = mul_add (a[10], b4, temp );
+	temp = mul_add (a[11], b3, temp );
+	c_avx[14] = mul_add (a[12], b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b[15]);
+	temp = mul_add (a1, b[14], temp );
+	temp = mul_add (a[14], b1, temp );
+	temp = mul_add (a[15], b0, temp );
+	temp = mul_add (a2, b[13], temp );
+	temp = mul_add (a3, b[12], temp );
+	temp = mul_add (a4, b[11], temp );
+	temp = mul_add (a5, b[10], temp );
+	temp = mul_add (a6, b[9], temp );
+	temp = mul_add (a7, b[8], temp );
+	temp = mul_add (a[8], b7, temp );
+	temp = mul_add (a[9], b6, temp );
+	temp = mul_add (a[10], b5, temp );
+	temp = mul_add (a[11], b4, temp );
+	temp = mul_add (a[12], b3, temp );
+	c_avx[15] = mul_add (a[13], b2, temp );
+
+
+	// Second triangle (unrolled): c_avx[k] for k = 16..30. The locals are rebound to the upper halves; note a0/b0 hold index 14 and a1/b1 hold index 15.
+	a0=a[14];
+	a1=a[15];
+	a2=a[13];
+	a3=a[12];
+	a4=a[11];
+	a5=a[10];
+	a6=a[9];
+	a7=a[8];
+
+	b0=b[14];
+	b1=b[15];
+	b2=b[13];
+	b3=b[12];
+	b4=b[11];
+	b5=b[10];
+	b6=b[9];
+	b7=b[8];
+	
+
+	temp = _mm256_mullo_epi16 (a[1], b1);
+	temp = mul_add (a[2], b0, temp );
+	temp = mul_add (a[3], b2, temp );
+	temp = mul_add (a[4], b3, temp );
+	temp = mul_add (a[5], b4, temp );
+	temp = mul_add (a[6], b5, temp );
+	temp = mul_add (a[7], b6, temp );
+	temp = mul_add (a7, b7, temp );
+	temp = mul_add (a6, b[7], temp );
+	temp = mul_add (a5, b[6], temp );
+	temp = mul_add (a4, b[5], temp );
+	temp = mul_add (a3, b[4], temp );
+	temp = mul_add (a2, b[3], temp );
+	temp = mul_add (a0, b[2], temp );
+	c_avx[16] = mul_add (a1, b[1], temp );
+
+	temp = _mm256_mullo_epi16 (a[2], b1);
+	temp = mul_add (a[3], b0, temp );
+	temp = mul_add (a[4], b2, temp );
+	temp = mul_add (a[5], b3, temp );
+	temp = mul_add (a[6], b4, temp );
+	temp = mul_add (a[7], b5, temp );
+	temp = mul_add (a7, b6, temp );
+	temp = mul_add (a6, b7, temp );
+	temp = mul_add (a5, b[7], temp );
+	temp = mul_add (a4, b[6], temp );
+	temp = mul_add (a3, b[5], temp );
+	temp = mul_add (a2, b[4], temp );
+	temp = mul_add (a0, b[3], temp );
+	c_avx[17] = mul_add (a1, b[2], temp );
+
+	temp = _mm256_mullo_epi16 (a[3], b1);
+	temp = mul_add (a[4], b0, temp );
+	temp = mul_add (a[5], b2, temp );
+	temp = mul_add (a[6], b3, temp );
+	temp = mul_add (a[7], b4, temp );
+	temp = mul_add (a7, b5, temp );
+	temp = mul_add (a6, b6, temp );
+	temp = mul_add (a5, b7, temp );
+	temp = mul_add (a4, b[7], temp );
+	temp = mul_add (a3, b[6], temp );
+	temp = mul_add (a2, b[5], temp );
+	temp = mul_add (a0, b[4], temp );
+	c_avx[18] = mul_add (a1, b[3], temp );
+
+	temp = _mm256_mullo_epi16 (a[4], b1);
+	temp = mul_add (a[5], b0, temp );
+	temp = mul_add (a[6], b2, temp );
+	temp = mul_add (a[7], b3, temp );
+	temp = mul_add (a7, b4, temp );
+	temp = mul_add (a6, b5, temp );
+	temp = mul_add (a5, b6, temp );
+	temp = mul_add (a4, b7, temp );
+	temp = mul_add (a3, b[7], temp );
+	temp = mul_add (a2, b[6], temp );
+	temp = mul_add (a0, b[5], temp );
+	c_avx[19] = mul_add (a1, b[4], temp );
+
+	temp = _mm256_mullo_epi16 (a[5], b1);
+	temp = mul_add (a[6], b0, temp );
+	temp = mul_add (a[7], b2, temp );
+	temp = mul_add (a7, b3, temp );
+	temp = mul_add (a6, b4, temp );
+	temp = mul_add (a5, b5, temp );
+	temp = mul_add (a4, b6, temp );
+	temp = mul_add (a3, b7, temp );
+	temp = mul_add (a2, b[7], temp );
+	temp = mul_add (a0, b[6], temp );
+	c_avx[20] = mul_add (a1, b[5], temp );
+
+	temp = _mm256_mullo_epi16 (a[6], b1);
+	temp = mul_add (a[7], b0, temp );
+	temp = mul_add (a7, b2, temp );
+	temp = mul_add (a6, b3, temp );
+	temp = mul_add (a5, b4, temp );
+	temp = mul_add (a4, b5, temp );
+	temp = mul_add (a3, b6, temp );
+	temp = mul_add (a2, b7, temp );
+	temp = mul_add (a0, b[7], temp );
+	c_avx[21] = mul_add (a1, b[6], temp );
+
+	temp = _mm256_mullo_epi16 (a[7], b1);
+	temp = mul_add (a7, b0, temp );
+	temp = mul_add (a6, b2, temp );
+	temp = mul_add (a5, b3, temp );
+	temp = mul_add (a4, b4, temp );
+	temp = mul_add (a3, b5, temp );
+	temp = mul_add (a2, b6, temp );
+	temp = mul_add (a0, b7, temp );
+	c_avx[22] = mul_add (a1, b[7], temp );
+
+	temp = _mm256_mullo_epi16 (a7, b1);
+	temp = mul_add (a6, b0, temp );
+	temp = mul_add (a5, b2, temp );
+	temp = mul_add (a4, b3, temp );
+	temp = mul_add (a3, b4, temp );
+	temp = mul_add (a2, b5, temp );
+	temp = mul_add (a0, b6, temp );
+	c_avx[23] = mul_add (a1, b7, temp );
+
+	temp = _mm256_mullo_epi16 (a6, b1);
+	temp = mul_add (a5, b0, temp );
+	temp = mul_add (a4, b2, temp );
+	temp = mul_add (a3, b3, temp );
+	temp = mul_add (a2, b4, temp );
+	temp = mul_add (a0, b5, temp );
+	c_avx[24] = mul_add (a1, b6, temp );
+
+	temp = _mm256_mullo_epi16 (a5, b1);
+	temp = mul_add (a4, b0, temp );
+	temp = mul_add (a3, b2, temp );
+	temp = mul_add (a2, b3, temp );
+	temp = mul_add (a0, b4, temp );
+	c_avx[25] = mul_add (a1, b5, temp );
+
+	temp = _mm256_mullo_epi16 (a4, b1);
+	temp = mul_add (a3, b0, temp );
+	temp = mul_add (a2, b2, temp );
+	temp = mul_add (a0, b3, temp );
+	c_avx[26] = mul_add (a1, b4, temp );
+
+	temp = _mm256_mullo_epi16 (a3, b1);
+	temp = mul_add (a2, b0, temp );
+	temp = mul_add (a0, b2, temp );
+	c_avx[27] = mul_add (a1, b3, temp );
+
+	temp = _mm256_mullo_epi16 (a2, b1);
+	temp = mul_add (a0, b0, temp );
+	c_avx[28] = mul_add (a1, b2, temp );
+
+	temp = _mm256_mullo_epi16 (a0, b1);
+	c_avx[29] = mul_add (a1, b0, temp);
+
+	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
+
+	// The last slot is set to zero (index 31 if SCM_SIZE is 16; SCM_SIZE is not defined in this file).
+	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
+
+}
diff --git a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c
new file mode 100644
index 00000000..78fb86c2
--- /dev/null
+++ b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c
@@ -0,0 +1,1010 @@
+/*
+Cleaned version; for the step-by-step approach, look into the _debug file.
+*/
+//#include "timing.c"
+#include "consts.h"
+#include "matrix.c"
+#include "scm_avx.c"
+
+static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)// Karatsuba-evaluates 7 operands from a and multiplies them against b_bucket; no interpolation is done here.
+{
+	__m256i a_bucket[SCM_SIZE*4]; // 7 operands x 9 evaluation points = 63 slots written below; the 64th slot is padding (64 slots assumes SCM_SIZE = 16)
+
+	// a:        7 operands of 4 vectors each; operand k starts at a[k*small_len_avx].
+	// b_bucket: second operand, already evaluated by the caller; read in 4 groups of 16 vectors.
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+	// c_bucket: 4 result groups, group g starting at c_bucket + 2*g*SCM_SIZE.
+	// f:        0 overwrites c_bucket (schoolbook_avx_new2); any other value accumulates into it (schoolbook_avx_new3_acc).
+
+		// Each operand (r0..r3) expands to 9 points: r0, r1, r2, r3, r0+r1, r2+r3, r0+r2, r1+r3, r0+r1+r2+r3.
+		
+		//------------------AVX evaluation for 1st poly-----------------------
+
+                    r0_avx=a[0];
+                    r1_avx=a[1];
+                    r2_avx=a[2];
+                    r3_avx=a[3];
+		    a_bucket[0]=r0_avx;
+		    a_bucket[1]=r1_avx;
+		    a_bucket[2]=r2_avx;
+		    a_bucket[3]=r3_avx;
+		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
+
+
+		//------------------AVX evaluation for 1st poly ends------------------
+
+
+		//------------------AVX evaluation for 2nd poly-----------------------
+                    r0_avx=a[small_len_avx];
+                    r1_avx=a[small_len_avx+1];
+                    r2_avx=a[small_len_avx+2];
+                    r3_avx=a[small_len_avx+3];
+		    a_bucket[0+9]=r0_avx;
+		    a_bucket[1+9]=r1_avx;
+		    a_bucket[2+9]=r2_avx;
+		    a_bucket[3+9]=r3_avx;
+		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
+
+	
+		//------------------AVX evaluation for 2nd poly ends------------------
+
+
+		//------------------AVX evaluation for 3rd poly-----------------------
+                    r0_avx=a[2*small_len_avx];
+                    r1_avx=a[2*small_len_avx+1];
+                    r2_avx=a[2*small_len_avx+2];
+                    r3_avx=a[2*small_len_avx+3];
+		    a_bucket[0+18]=r0_avx;
+		    a_bucket[1+18]=r1_avx;
+		    a_bucket[2+18]=r2_avx;
+		    a_bucket[3+18]=r3_avx;
+		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
+		
+		//------------------AVX evaluation for 3rd poly ends------------------
+
+
+		//------------------AVX evaluation for 4th poly-----------------------
+
+                    r0_avx=a[3*small_len_avx];
+                    r1_avx=a[3*small_len_avx+1];
+                    r2_avx=a[3*small_len_avx+2];
+                    r3_avx=a[3*small_len_avx+3];
+		    a_bucket[0+27]=r0_avx;
+		    a_bucket[1+27]=r1_avx;
+		    a_bucket[2+27]=r2_avx;
+		    a_bucket[3+27]=r3_avx;
+		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
+		
+		//------------------AVX evaluation for 4th poly ends------------------
+
+		//------------------AVX evaluation for 5th poly-----------------------
+		
+                    r0_avx=a[4*small_len_avx+0];
+                    r1_avx=a[4*small_len_avx+1];
+                    r2_avx=a[4*small_len_avx+2];
+                    r3_avx=a[4*small_len_avx+3];
+		    a_bucket[0+36]=r0_avx;
+		    a_bucket[1+36]=r1_avx;
+		    a_bucket[2+36]=r2_avx;
+		    a_bucket[3+36]=r3_avx;
+		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
+		
+		//------------------AVX evaluation for 5th poly ends------------------
+
+
+		//------------------AVX evaluation for 6th poly-----------------------
+                    r0_avx=a[5*small_len_avx];
+                    r1_avx=a[5*small_len_avx+1];
+                    r2_avx=a[5*small_len_avx+2];
+                    r3_avx=a[5*small_len_avx+3];
+		    a_bucket[0+45]=r0_avx;
+		    a_bucket[1+45]=r1_avx;
+		    a_bucket[2+45]=r2_avx;
+		    a_bucket[3+45]=r3_avx;
+		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
+		
+		//------------------AVX evaluation for 6th poly ends------------------
+
+		//------------------AVX evaluation for 7th poly-----------------------
+
+                    r0_avx=a[6*small_len_avx];
+                    r1_avx=a[6*small_len_avx+1];
+                    r2_avx=a[6*small_len_avx+2];
+                    r3_avx=a[6*small_len_avx+3];
+		    a_bucket[0+54]=r0_avx;
+		    a_bucket[1+54]=r1_avx;
+		    a_bucket[2+54]=r2_avx;
+		    a_bucket[3+54]=r3_avx;
+		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
+
+		//------------------AVX evaluation for 7th poly ends------------------
+		// Only slots 0..62 are written above; zero slot 63 so the last group (a_bucket+48) never reads uninitialised stack data.
+		a_bucket[63] = _mm256_setzero_si256();
+
+		//CLOCK2=cpucycles();
+		//CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1);
+		//printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1);
+
+
+		//CLOCK1=cpucycles();
+		//-----------------Forward transposes--------------------------------------
+			transpose_n1(a_bucket);
+			transpose_n1(a_bucket+16);
+			transpose_n1(a_bucket+32);
+			transpose_n1(a_bucket+48);
+
+		//-----------------Forward transposes end----------------------------------
+
+		//----------------------all multiplications---------------------------------
+		if(f==0){
+			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		else{
+			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
+			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
+			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
+			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
+		}
+		/*
+		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
+		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
+		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
+		*/
+
+
+		//----------------------all multiplications ends-----------------------------
+
+
+		//-----------------Reverse transposes (disabled; c_bucket is left as the multiplications wrote it)-----
+
+			/*
+			transpose(c_bucket);
+			transpose(c_bucket+16);
+
+			transpose(c_bucket+2*SCM_SIZE);
+			transpose(c_bucket+16+2*SCM_SIZE);
+
+			transpose(c_bucket+4*SCM_SIZE);
+			transpose(c_bucket+16+4*SCM_SIZE);
+
+			transpose(c_bucket+6*SCM_SIZE);
+			transpose(c_bucket+16+6*SCM_SIZE);
+			*/
+		//-----------------Reverse transposes ends---------------------------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
+
+		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
+		
+}
+
+static void KARA_eval(__m256i* b, __m256i *b_bucket){
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
+
+
+		//-------1st poly----------------------------------------------------
+                    r0_avx=b[0];
+                    r1_avx=b[1];
+                    r2_avx=b[2];
+                    r3_avx=b[3];
+		    b_bucket[0]=r0_avx;
+		    b_bucket[1]=r1_avx;
+		    b_bucket[2]=r2_avx;
+		    b_bucket[3]=r3_avx;
+		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
+		//-------2nd poly----------------------------------------------------
+
+                    r0_avx=b[small_len_avx];
+                    r1_avx=b[small_len_avx+1];
+                    r2_avx=b[small_len_avx+2];
+                    r3_avx=b[small_len_avx+3];
+		    b_bucket[0+9]=r0_avx;
+		    b_bucket[1+9]=r1_avx;
+		    b_bucket[2+9]=r2_avx;
+		    b_bucket[3+9]=r3_avx;
+		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
+
+		//-------3rd poly----------------------------------------------------
+
+                    r0_avx=b[2*small_len_avx+0];
+                    r1_avx=b[2*small_len_avx+1];
+                    r2_avx=b[2*small_len_avx+2];
+                    r3_avx=b[2*small_len_avx+3];
+		    b_bucket[0+18]=r0_avx;
+		    b_bucket[1+18]=r1_avx;
+		    b_bucket[2+18]=r2_avx;
+		    b_bucket[3+18]=r3_avx;
+		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
+
+		//-------4th poly----------------------------------------------------
+                    r0_avx=b[3*small_len_avx];
+                    r1_avx=b[3*small_len_avx+1];
+                    r2_avx=b[3*small_len_avx+2];
+                    r3_avx=b[3*small_len_avx+3];
+		    b_bucket[0+27]=r0_avx;
+		    b_bucket[1+27]=r1_avx;
+		    b_bucket[2+27]=r2_avx;
+		    b_bucket[3+27]=r3_avx;
+		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
+
+		//-------5th poly----------------------------------------------------
+
+                    r0_avx=b[4*small_len_avx];
+                    r1_avx=b[4*small_len_avx+1];
+                    r2_avx=b[4*small_len_avx+2];
+                    r3_avx=b[4*small_len_avx+3];
+		    b_bucket[0+36]=r0_avx;
+		    b_bucket[1+36]=r1_avx;
+		    b_bucket[2+36]=r2_avx;
+		    b_bucket[3+36]=r3_avx;
+		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
+
+		//-------6th poly----------------------------------------------------
+
+                    r0_avx=b[5*small_len_avx];
+                    r1_avx=b[5*small_len_avx+1];
+                    r2_avx=b[5*small_len_avx+2];
+                    r3_avx=b[5*small_len_avx+3];
+		    b_bucket[0+45]=r0_avx;
+		    b_bucket[1+45]=r1_avx;
+		    b_bucket[2+45]=r2_avx;
+		    b_bucket[3+45]=r3_avx;
+		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
+
+		//-------7th poly----------------------------------------------------
+
+                    r0_avx=b[6*small_len_avx];
+                    r1_avx=b[6*small_len_avx+1];
+                    r2_avx=b[6*small_len_avx+2];
+                    r3_avx=b[6*small_len_avx+3];
+		    b_bucket[0+54]=r0_avx;
+		    b_bucket[1+54]=r1_avx;
+		    b_bucket[2+54]=r2_avx;
+		    b_bucket[3+54]=r3_avx;
+		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
+		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
+		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
+		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
+		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
+
+		//--------------Evaluating B poly ends-------------------------------
+
+			transpose_n1(b_bucket);
+			transpose_n1(b_bucket+16);
+			transpose_n1(b_bucket+32);
+			transpose_n1(b_bucket+48);	
+}
+
+static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
+		// Combines entries of c_bucket into seven outputs: each section below reads 18 c_bucket vectors and writes result_finalK[0..7], using only lane-wise 16-bit add/sub.
+		//int64_t i;
+		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
+
+		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
+		// NOTE(review): the cN_avx names match the c_bucket indices used by the 1st section only ([6],[7],[8],[20]..[24]); later sections reuse them for other indices.
+		//CLOCK1=cpucycles();
+		// NOTE(review): per section the index advances by 9 through alternating 16-entry blocks ([0..15],[32..47],.. for the first nine reads; [16..31],[48..63],.. for the second nine) - presumably the layout written by the multiplier; confirm there.
+		   //------------------------AVX interpolation for 1st poly external-------------------		
+			// reads c_bucket[0..8] and c_bucket[16..24]
+			//loop1
+				res_avx0 = c_bucket[0];
+				res_avx2 = c_bucket[1];
+				res_avx4 = c_bucket[2];
+				res_avx6 = c_bucket[3];
+				// even-numbered res_avx* are loaded from the first group, odd-numbered ones (below) from the second group
+				c6_avx=c_bucket[6];
+				c7_avx=c_bucket[7];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[16];
+				res_avx3 = c_bucket[17];
+				res_avx5 = c_bucket[18];
+				res_avx7 = c_bucket[19];
+
+				c22_avx=c_bucket[22];
+				c23_avx=c_bucket[23];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+				// order matters: these subtractions use the res_avx1/2/5/6 values already updated in loop1
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final0[0]=res_avx0;
+				result_final0[1]=res_avx1;
+				// entries 2..5 additionally receive the terms computed in loop4; entries 0,1,6,7 are stored as they are
+				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final0[6]=res_avx6;
+				result_final0[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 1st poly ends--------------
+
+
+		   //------------------------AVX interpolation for 2nd poly external-------------------		
+			// same computation as the 1st section; reads c_bucket[9..15],[32..33] and c_bucket[25..31],[48..49]
+			//loop1
+				res_avx0 = c_bucket[9]; //c_bucket0
+				res_avx2 = c_bucket[10]; //c_bucket1
+				res_avx4 = c_bucket[11]; //c_bucket2
+				res_avx6 = c_bucket[12]; //c_bucket3
+
+				c6_avx=c_bucket[15]; //c_bucket6
+				c7_avx=c_bucket[32]; //c_bucket7
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[25]; //c_bucket0
+				res_avx3 = c_bucket[26]; //c_bucket1
+				res_avx5 = c_bucket[27]; //c_bucket2
+				res_avx7 = c_bucket[28]; //c_bucket3
+
+				c22_avx=c_bucket[31];
+				c23_avx=c_bucket[48];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final1[0]=res_avx0;
+				result_final1[1]=res_avx1;
+
+				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final1[6]=res_avx6;
+				result_final1[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 2nd poly ends--------------
+
+		   //------------------------AVX interpolation for 3rd poly external-------------------		
+			// same computation; reads c_bucket[34..42] and c_bucket[50..58]
+			//loop1
+				res_avx0 = c_bucket[34]; //c_bucket0
+				res_avx2 = c_bucket[35]; //c_bucket1
+				res_avx4 = c_bucket[36];
+				res_avx6 = c_bucket[37];
+
+				c6_avx=c_bucket[40];
+				c7_avx=c_bucket[41];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[50]; //c_bucket0
+				res_avx3 = c_bucket[51]; //c_bucket1
+				res_avx5 = c_bucket[52];
+				res_avx7 = c_bucket[53];
+
+				c22_avx=c_bucket[56];
+				c23_avx=c_bucket[57];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+			//loop4
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+			//loop5
+				result_final2[0]=res_avx0;
+				result_final2[1]=res_avx1;
+
+				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final2[6]=res_avx6;
+				result_final2[7]=res_avx7;
+
+		   //------------------------AVX interpolation for 3rd poly ends--------------
+		
+		   //------------------------AVX interpolation for 4th poly external-------------------		
+			// same computation; reads c_bucket[43..47],[64..67] and c_bucket[59..63],[80..83]
+			//loop1
+				res_avx0 = c_bucket[43];
+				res_avx2 = c_bucket[44];
+				res_avx4 = c_bucket[45];
+				res_avx6 = c_bucket[46];
+
+				c6_avx=c_bucket[65];
+				c7_avx=c_bucket[66];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[59];
+				res_avx3 = c_bucket[60];
+				res_avx5 = c_bucket[61];
+				res_avx7 = c_bucket[62];
+
+				c22_avx=c_bucket[81];
+				c23_avx=c_bucket[82];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final3[0]=res_avx0;
+				result_final3[1]=res_avx1;
+
+				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final3[6]=res_avx6;
+				result_final3[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 4th poly ends--------------
+
+		   //------------------------AVX interpolation for 5th poly external-------------------		
+			// same computation; reads c_bucket[68..76] and c_bucket[84..92]
+			//loop1
+				res_avx0 = c_bucket[68];
+				res_avx2 = c_bucket[69];
+				res_avx4 = c_bucket[70];
+				res_avx6 = c_bucket[71];
+
+				c6_avx=c_bucket[74];
+				c7_avx=c_bucket[75];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[84];
+				res_avx3 = c_bucket[85];
+				res_avx5 = c_bucket[86];
+				res_avx7 = c_bucket[87];
+
+				c22_avx=c_bucket[90];
+				c23_avx=c_bucket[91];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final4[0]=res_avx0;
+				result_final4[1]=res_avx1;
+
+				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final4[6]=res_avx6;
+				result_final4[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 5th poly ends--------------
+
+		   //------------------------AVX interpolation for 6th poly external-------------------		
+			// same computation; reads c_bucket[77..79],[96..101] and c_bucket[93..95],[112..117]
+			//loop1
+				res_avx0 = c_bucket[77];
+				res_avx2 = c_bucket[78];
+				res_avx4 = c_bucket[79];
+				res_avx6 = c_bucket[96];
+
+				c6_avx=c_bucket[99];
+				c7_avx=c_bucket[100];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[93];
+				res_avx3 = c_bucket[94];
+				res_avx5 = c_bucket[95];
+				res_avx7 = c_bucket[112];
+
+				c22_avx=c_bucket[115];
+				c23_avx=c_bucket[116];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final5[0]=res_avx0;
+				result_final5[1]=res_avx1;
+
+				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final5[6]=res_avx6;
+				result_final5[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 6th poly ends--------------
+
+		   //------------------------AVX interpolation for 7th poly external-------------------		
+			// same computation; reads c_bucket[102..110] and c_bucket[118..126]
+			//loop1
+				res_avx0 = c_bucket[102];
+				res_avx2 = c_bucket[103];
+				res_avx4 = c_bucket[104];
+				res_avx6 = c_bucket[105];
+
+				c6_avx=c_bucket[108];
+				c7_avx=c_bucket[109];
+		
+				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
+
+				res_avx1 = c_bucket[118];
+				res_avx3 = c_bucket[119];
+				res_avx5 = c_bucket[120];
+				res_avx7 = c_bucket[121];
+
+				c22_avx=c_bucket[124];
+				c23_avx=c_bucket[125];
+
+				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
+
+				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
+
+				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
+
+				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
+				res_avx5 = _mm256_add_epi16(res_avx5, temp);
+
+				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
+				res_avx1 = _mm256_add_epi16(res_avx1, temp);
+
+				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
+
+				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+
+				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+
+				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
+
+
+			//loop4
+
+				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+
+				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+			//loop5
+				result_final6[0]=res_avx0;
+				result_final6[1]=res_avx1;
+
+				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
+				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
+
+
+				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
+				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
+
+				result_final6[6]=res_avx6;
+				result_final6[7]=res_avx7;
+
+
+		   //------------------------AVX interpolation for 7th poly ends--------------
+
+		//CLOCK2=cpucycles();
+		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
+		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
+
+
+
+}
+
+static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
+	// Splits a_avx into four limbs of small_len_avx vectors, forms seven linear combinations of them in aw_avx, then hands aw_avx, b_bucket, c_bucket and f to batch_64coefficient_multiplications_new.
+	int i;
+
+//---------------AVX data-----------------------------
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+	__m256i aw_avx[7*small_len_avx];
+
+//----------------AVX data----------------------------
+
+
+// EVALUATION
+	// All arithmetic below is per 16-bit lane (epi16 intrinsics). Reading r0..r3 as coefficients of y^0..y^3, the seven combinations are the values at infinity, 2, 1, -1, 1/2 (x8), -1/2 (x8) and 0.
+	//CLOCK1=cpucycles();
+
+	for (i=0; i<small_len_avx; i++){
+		r0_avx=a_avx[i];
+		r1_avx=a_avx[i + small_len_avx];
+		r2_avx=a_avx[i + 2*small_len_avx];
+		r3_avx=a_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3
+		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1); // r4 = 8*r0 + 2*r2
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3
+		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3
+		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx); // r4 = 8*r3 + 4*r2 + 2*r1
+		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3
+		aw_avx[6*small_len_avx+i]= r0_avx; // lowest limb unchanged
+		aw_avx[i]= r3_avx; // highest limb unchanged
+	}
+
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
+	// f is forwarded untouched; its meaning is defined by the callee (not visible in this function).
+	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);//New
+
+}
+
+static void TC_eval(__m256i* b_avx, __m256i* b_bucket){
+	// Forms seven linear combinations of the four small_len_avx-vector limbs of b_avx (per 16-bit lane) in bw_avx, then passes bw_avx to KARA_eval, which fills b_bucket.
+	int i;
+	__m256i bw_avx[7*small_len_avx];
+
+	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+	// Same combinations and same slot order as the loop in toom_cook_4way_avx_n1.
+	for (i=0; i<small_len_avx; i++){
+		
+		r0_avx=b_avx[i];
+		r1_avx=b_avx[i + small_len_avx];
+		r2_avx=b_avx[i + 2*small_len_avx];
+		r3_avx=b_avx[i + 3*small_len_avx];
+		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
+		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3
+		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3
+		r4_avx=_mm256_slli_epi16(r0_avx,2);
+		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
+		r4_avx=_mm256_slli_epi16(r4_avx,1); // r4 = 8*r0 + 2*r2
+		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
+		r5_avx=_mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3
+		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3
+		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3
+		r4_avx= _mm256_slli_epi16(r3_avx, 3);
+		r6_avx= _mm256_slli_epi16(r2_avx, 2);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
+		r6_avx= _mm256_slli_epi16(r1_avx, 1);
+		r4_avx= _mm256_add_epi16(r4_avx, r6_avx); // r4 = 8*r3 + 4*r2 + 2*r1
+		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3
+		bw_avx[6*small_len_avx+i]= r0_avx; // lowest limb unchanged
+		bw_avx[i]= r3_avx; // highest limb unchanged
+	}
+
+	KARA_eval(bw_avx, b_bucket);
+
+}
+
+
+static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){
+	// Turns c_bucket into the 16-vector result res_avx: transpose_n1 on eight blocks, KARA_interpol into w1..w7, a per-vector interpolation, an overlap-add into res_avx_output, and a final subtraction of the upper 16 vectors from the lower 16.
+	int i;
+
+	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+
+	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx];
+	// NOTE(review): KARA_interpol stores entries [0..7] of each wK_avx, so this relies on 2*small_len_avx >= 8 - confirm against the macro definition.
+	__m256i res_avx_output[2*AVX_N1];
+
+	//CLOCK1=cpucycles();
+	// transpose_n1 is applied at c_bucket + {0,16} + {0,2,4,6}*SCM_SIZE (presumably undoing the layout produced by the multiplication step - confirm).
+	
+	transpose_n1(c_bucket);
+	transpose_n1(c_bucket+16);
+
+	transpose_n1(c_bucket+2*SCM_SIZE);
+	transpose_n1(c_bucket+16+2*SCM_SIZE);
+
+	transpose_n1(c_bucket+4*SCM_SIZE);
+	transpose_n1(c_bucket+16+4*SCM_SIZE);
+
+	transpose_n1(c_bucket+6*SCM_SIZE);
+	transpose_n1(c_bucket+16+6*SCM_SIZE);
+	
+
+	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx);
+	// Interpolation on 16-bit lanes: divisions by odd constants are multiplications by their inverse mod 2^16, divisions by powers of two are logical right shifts. Statement order matters because r1..r5 are updated in place.
+	for (i = 0; i < 2*small_len_avx; i++) {
+
+		r0_avx = w1_avx[i];
+		r1_avx = w2_avx[i];
+		r2_avx = w3_avx[i];
+		r3_avx = w4_avx[i];
+		r4_avx = w5_avx[i];
+		r5_avx = w6_avx[i];
+		r6_avx = w7_avx[i];
+		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
+		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
+		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
+		r3_avx = _mm256_srli_epi16(r3_avx, 1);
+		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
+		temp_avx = _mm256_slli_epi16(r6_avx, 6);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_slli_epi16(r4_avx, 1);
+		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
+		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 6);
+		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
+		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
+		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
+		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+		temp_avx = _mm256_slli_epi16(r2_avx, 3);
+		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
+		r4_avx = _mm256_srli_epi16(r4_avx, 3);
+		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
+		temp_avx = _mm256_slli_epi16(r3_avx, 4);
+		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
+		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
+		r1_avx= _mm256_srli_epi16(r1_avx, 1); 	
+		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
+		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); // r3 = -r3
+		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
+		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
+		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
+		r5_avx= _mm256_srli_epi16(temp_avx, 2);
+		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
+		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
+		// Overlap-add: the first small_len_avx iterations initialise slots k*small_len_avx+i; later iterations land one limb higher and are added to what is already there, except the top slot which is written fresh.
+		if(i<small_len_avx){
+			res_avx_output[0*small_len_avx+i]=r6_avx;
+			res_avx_output[1*small_len_avx+i]=r5_avx;
+			res_avx_output[2*small_len_avx+i]=r4_avx;
+			res_avx_output[3*small_len_avx+i]=r3_avx;
+			res_avx_output[4*small_len_avx+i]=r2_avx;
+			res_avx_output[5*small_len_avx+i]=r1_avx;
+			res_avx_output[6*small_len_avx+i]=r0_avx;
+		}
+		else{
+			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
+			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
+			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
+			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
+			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
+			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
+			res_avx_output[6*small_len_avx+i]=r0_avx; // slot not written by the first half, so assigned rather than accumulated
+		}
+	}
+
+	//CLOCK2=cpucycles();
+	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
+
+	// Reduction by X^256 + 1
+	for(i=0; i<16; i++) // NOTE(review): the literal 16 presumably equals AVX_N1 (res_avx_output has 2*AVX_N1 entries) - confirm
+  {
+		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
+  }
+
+}
diff --git a/crypto_kem/saber/avx2/verify.c b/crypto_kem/saber/avx2/verify.c
new file mode 100644
index 00000000..fe089639
--- /dev/null
+++ b/crypto_kem/saber/avx2/verify.c
@@ -0,0 +1,35 @@
+#include "verify.h"
+
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehlé
+----------------------------------------------------*/
+
+
+/* Compares the first len bytes of a and b, always scanning all of them (no early exit); returns 0 if every byte matches, 1 otherwise. */
+uint8_t PQCLEAN_SABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len) {
+    uint64_t r;
+    size_t i;
+    r = 0;
+    /* OR together the XOR of every byte pair: r stays 0 only if all pairs are equal */
+    for (i = 0; i < len; i++) {
+        r |= a[i] ^ b[i];
+    }
+    /* r is now in 0..255; negating it sets bit 63 exactly when r != 0 */
+    r = (~r + 1); // Two's complement
+    r >>= 63; // keep only that bit: 0 or 1
+    return (uint8_t) r;
+}
+
+/* Copies len bytes from x into r when b == 1 and leaves r unchanged when b == 0, without branching on b; other values of b give a partial mask. */
+void PQCLEAN_SABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
+    size_t i;
+    /* turn b into a byte mask: 0 -> 0x00, 1 -> 0xFF */
+    b = -b;
+    for (i = 0; i < len; i++) {
+        r[i] ^= b & (x[i] ^ r[i]); /* mask 0xFF: r[i] becomes x[i]; mask 0x00: r[i] unchanged */
+    }
+}
diff --git a/crypto_kem/saber/avx2/verify.h b/crypto_kem/saber/avx2/verify.h
new file mode 100644
index 00000000..32edf5d0
--- /dev/null
+++ b/crypto_kem/saber/avx2/verify.h
@@ -0,0 +1,22 @@
+#ifndef VERIFY_H
+#define VERIFY_H
+/*-------------------------------------------------
+This file has been adapted from the implementation
+(available at https://github.com/pq-crystals/kyber) of
+"CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
+ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
+Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien Stehlé
+----------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Compares the first len bytes of a and b; returns 0 if they are all equal, 1 otherwise. */
+uint8_t PQCLEAN_SABER_AVX2_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
+
+/* Conditional move: b = 1 copies len bytes from x into r, b = 0 leaves r unchanged. */
+void PQCLEAN_SABER_AVX2_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
+
+#endif
diff --git a/crypto_kem/saber/clean/LICENSE b/crypto_kem/saber/clean/LICENSE
index 08c799e3..d5d21fff 100644
--- a/crypto_kem/saber/clean/LICENSE
+++ b/crypto_kem/saber/clean/LICENSE
@@ -1,8 +1 @@
-----------------------------------------------------------------------------------------
-SABER_v1.1
-
-Public domain
-
-Authors: Jan-Pieter D'Anvers, Angshuman Karmakar, Sujoy Sinha Roy,
-Frederik Vercauteren
-----------------------------------------------------------------------------------------
+Public Domain
diff --git a/crypto_kem/saber/clean/Makefile b/crypto_kem/saber/clean/Makefile
index 2052d200..cbc1357c 100644
--- a/crypto_kem/saber/clean/Makefile
+++ b/crypto_kem/saber/clean/Makefile
@@ -1,10 +1,10 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=libsaber_clean.a
-HEADERS=api.h cbd.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h pack_unpack.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h poly_mul.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
-CFLAGS=-O3 -Wall -Wextra -Wpedantic -Werror -Wmissing-prototypes -Wredundant-decls -std=c99 -I../../../common $(EXTRAFLAGS)
+CFLAGS=-O3 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
 all: $(LIB)
 
diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c
index d381194c..fe54f4ca 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.c
+++ b/crypto_kem/saber/clean/SABER_indcpa.c
@@ -3,296 +3,90 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include "randombytes.h"
 #include <stdint.h>
 #include <string.h>
 
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
+void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N] = {0};
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    uint8_t seed_A[SABER_SEEDBYTES];
+    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
+    int i, j;
 
-#define h1 4 //2^(EQ-EP-1)
+    randombytes(seed_A, SABER_SEEDBYTES);
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
+    randombytes(seed_s, SABER_NOISE_SEEDBYTES);
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+    PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A);
+    PQCLEAN_SABER_CLEAN_GenSecret(s, seed_s);
+    PQCLEAN_SABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1);
 
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]);
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose);
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec);
-
-static void GenMatrix(polyvec *a, const unsigned char *seed) {
-    unsigned char buf[SABER_K * SABER_K * (13 * SABER_N / 8)];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_SABER_CLEAN_BS2POL(buf + (i * SABER_K + j) * (13 * SABER_N / 8), temp_ar);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP);
         }
     }
+
+    PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s);
+    PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b);
+    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A));
 }
 
+void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    uint16_t A[SABER_L][SABER_L][SABER_N];
+    uint16_t sp[SABER_L][SABER_N];
+    uint16_t bp[SABER_L][SABER_N] = {0};
+    uint16_t vp[SABER_N] = {0};
+    uint16_t mp[SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    int i, j;
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
-void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk) {
-    polyvec a[SABER_K];
+    PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A);
+    PQCLEAN_SABER_CLEAN_GenSecret(sp, seed_sp);
+    PQCLEAN_SABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0);
 
-    uint16_t skpv[SABER_K][SABER_N];
-
-    unsigned char seed[SABER_SEEDBYTES];
-    unsigned char noiseseed[SABER_COINBYTES];
-    int32_t i, j;
-    uint16_t mod_q = SABER_Q - 1;
-
-
-    uint16_t res[SABER_K][SABER_N];
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    // for not revealing system RNG state
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES);
-    randombytes(noiseseed, SABER_COINBYTES);
-
-    GenMatrix(a, seed);   //sample matrix A
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_SABER_CLEAN_GenSecret(skpv, noiseseed);
-
-    // do the matrix vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
+    for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv, res, SABER_Q - 1, 1);
-
-    // now rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            // shift right 3 bits
-            res[i][j] = (res[i][j] + h1) & (mod_q);
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP));
+            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP);
         }
     }
 
-    // unload and pack sk=3 x (256 coefficients of 14 bits)
-    PQCLEAN_SABER_CLEAN_POLVEC2BS(sk, skpv, SABER_Q);
+    PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp);
+    PQCLEAN_SABER_CLEAN_BS2POLVECp(b, pk);
+    PQCLEAN_SABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp);
 
-    // unload and pack pk=256 bits seed and 3 x (256 coefficients of 11 bits)
-    // load the public-key coefficients
-    PQCLEAN_SABER_CLEAN_POLVEC2BS(pk, res, SABER_P);
+    PQCLEAN_SABER_CLEAN_BS2POLmsg(mp, m);
 
-
-    // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
+    for (j = 0; j < SABER_N; j++) {
+        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET);
     }
 
+    PQCLEAN_SABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
 }
 
+void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
 
-void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(const unsigned char *message_received, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext) {
-    uint32_t i, j, k;
-    polyvec a[SABER_K];
-    unsigned char seed[SABER_SEEDBYTES];
-    // public key of received by the client
-    uint16_t pkcl[SABER_K][SABER_N];
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-    uint16_t res[SABER_K][SABER_N];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t mod_q = SABER_Q - 1;
-    uint16_t vprime[SABER_N];
-    unsigned char msk_c[SABER_SCALEBYTES_KEM];
+    uint16_t s[SABER_L][SABER_N];
+    uint16_t b[SABER_L][SABER_N];
+    uint16_t v[SABER_N] = {0};
+    uint16_t cm[SABER_N];
+    int i;
 
-    // extract the seedbytes from Public Key.
-    for (i = 0; i < SABER_SEEDBYTES; i++) {
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
+    PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk);
+    PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext);
+    PQCLEAN_SABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s);
+    PQCLEAN_SABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES);
 
-    GenMatrix(a, seed);
-
-    // generate secret from constant-time binomial distribution
-    PQCLEAN_SABER_CLEAN_GenSecret(skpv1, noiseseed);
-
-    // matrix-vector multiplication and rounding
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = 0;
-        }
-    }
-    MatrixVectorMul(a, skpv1, res, SABER_Q - 1, 0);
-
-    // now rounding
-    //shift right 3 bits
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            res[i][j] = ( res[i][j] + h1 ) & mod_q;
-            res[i][j] = (res[i][j] >> (SABER_EQ - SABER_EP) );
-        }
-    }
-
-    PQCLEAN_SABER_CLEAN_POLVEC2BS(ciphertext, res, SABER_P);
-
-    // ************client matrix-vector multiplication ends************
-
-    // now calculate the v'
-    // unpack the public_key
-    // pkcl is the b in the protocol
-    PQCLEAN_SABER_CLEAN_BS2POLVEC(pk, pkcl, SABER_P);
     for (i = 0; i < SABER_N; i++) {
-        vprime[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            skpv1[i][j] = skpv1[i][j] & (mod_p);
-        }
+        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
     }
 
-    // vector-vector scalar multiplication with mod p
-    InnerProd(pkcl, skpv1, mod_p, vprime);
-
-    // addition of h1 to vprime
-    for (i = 0; i < SABER_N; i++) {
-        vprime[i] = vprime[i] + h1;
-    }
-
-    // unpack message_received;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((message_received[j] >> i) & 0x01);
-        }
-    }
-
-    // message encoding
-    for (i = 0; i < SABER_N; i++) {
-        message[i] = (message[i] << (SABER_EP - 1));
-    }
-
-    for (k = 0; k < SABER_N; k++) {
-        vprime[k] = ( (vprime[k] - message[k]) & (mod_p) ) >> (SABER_EP - SABER_ET);
-    }
-
-
-    PQCLEAN_SABER_CLEAN_pack_4bit(msk_c, vprime);
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_POLYVECCOMPRESSEDBYTES + j] = msk_c[j];
-    }
-}
-
-
-void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char message_dec[]) {
-    uint32_t i, j;
-    // secret key of the server
-    uint16_t sksv[SABER_K][SABER_N];
-    uint16_t pksv[SABER_K][SABER_N];
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t mod_p = SABER_P - 1;
-    uint16_t v[SABER_N];
-    uint16_t op[SABER_N];
-
-    // sksv is the secret-key
-    PQCLEAN_SABER_CLEAN_BS2POLVEC(sk, sksv, SABER_Q);
-    // pksv is the ciphertext
-    PQCLEAN_SABER_CLEAN_BS2POLVEC(ciphertext, pksv, SABER_P);
-
-    // vector-vector scalar multiplication with mod p
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = 0;
-    }
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N; j++) {
-            sksv[i][j] = sksv[i][j] & (mod_p);
-        }
-    }
-    InnerProd(pksv, sksv, mod_p, v);
-
-    //Extraction
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_POLYVECCOMPRESSEDBYTES + i];
-    }
-
-    PQCLEAN_SABER_CLEAN_un_pack4bit(scale_ar, op);
-
-    //addition of h1
-    for (i = 0; i < SABER_N; i++) {
-        v[i] = ( ( v[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (mod_p) ) >> (SABER_EP - 1);
-    }
-
-    // pack decrypted message
-    POL2MSG(v, message_dec);
-}
-static void MatrixVectorMul(polyvec *a, uint16_t skpv[SABER_K][SABER_N], uint16_t res[SABER_K][SABER_N], uint16_t mod, int16_t transpose) {
-    uint16_t acc[SABER_N];
-    int32_t i, j, k;
-
-    if (transpose == 1) {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_SABER_CLEAN_pol_mul((uint16_t *)&a[j].vec[i], skpv[j], acc, SABER_Q, SABER_N);
-
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    //reduction mod p
-                    res[i][k] = (res[i][k] & mod);
-                    //clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    } else {
-        for (i = 0; i < SABER_K; i++) {
-            for (j = 0; j < SABER_K; j++) {
-                PQCLEAN_SABER_CLEAN_pol_mul((uint16_t *)&a[i].vec[j], skpv[j], acc, SABER_Q, SABER_N);
-                for (k = 0; k < SABER_N; k++) {
-                    res[i][k] = res[i][k] + acc[k];
-                    // reduction
-                    res[i][k] = res[i][k] & mod;
-                    // clear the accumulator
-                    acc[k] = 0;
-                }
-            }
-        }
-    }
-}
-
-static void POL2MSG(const uint16_t *message_dec_unpacked, unsigned char *message_dec) {
-    int32_t i, j;
-
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (uint8_t) (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
-
-
-static void InnerProd(uint16_t pkcl[SABER_K][SABER_N], uint16_t skpv[SABER_K][SABER_N], uint16_t mod, uint16_t res[SABER_N]) {
-    uint32_t j, k;
-    uint16_t acc[SABER_N];
-
-    // vector-vector scalar multiplication with mod p
-    for (j = 0; j < SABER_K; j++) {
-        PQCLEAN_SABER_CLEAN_pol_mul(pkcl[j], skpv[j], acc, SABER_P, SABER_N);
-
-        for (k = 0; k < SABER_N; k++) {
-            res[k] = res[k] + acc[k];
-            // reduction
-            res[k] = res[k] & mod;
-            // clear the accumulator
-            acc[k] = 0;
-        }
-    }
+    PQCLEAN_SABER_CLEAN_POLmsg2BS(m, v);
 }
diff --git a/crypto_kem/saber/clean/SABER_indcpa.h b/crypto_kem/saber/clean/SABER_indcpa.h
index f8503f66..3be3ce1c 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.h
+++ b/crypto_kem/saber/clean/SABER_indcpa.h
@@ -1,9 +1,13 @@
 #ifndef INDCPA_H
 #define INDCPA_H
+#include "SABER_params.h"
+#include <stdint.h>
+
+void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
+
+void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+
+void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
-void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(unsigned char *pk, unsigned char *sk);
-void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(const unsigned char *message, unsigned char *noiseseed, const unsigned char *pk, unsigned char *ciphertext);
-void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(const unsigned char *sk, const unsigned char *ciphertext, unsigned char *message_dec);
 
 #endif
-
diff --git a/crypto_kem/saber/clean/SABER_params.h b/crypto_kem/saber/clean/SABER_params.h
index faa9f6db..200ed0e6 100644
--- a/crypto_kem/saber/clean/SABER_params.h
+++ b/crypto_kem/saber/clean/SABER_params.h
@@ -1,50 +1,39 @@
 #ifndef PARAMS_H
 #define PARAMS_H
 
-#include "api.h"
 
-#define SABER_K 3
+/* SABER_L selects the security strength; it is fixed to 3 in this variant */
+
+/* Don't change anything below this line */
+#define SABER_L 3
 #define SABER_MU 8
 #define SABER_ET 4
 
-
 #define SABER_EQ 13
 #define SABER_EP 10
-
 #define SABER_N 256
-#define SABER_Q 8192
-#define SABER_P 1024
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_SEEDBYTES 32
+#define SABER_NOISE_SEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_HASHBYTES       32
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8)
 
-#define SABER_POLYBYTES       416 //13*256/8 
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8)
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)
 
-#define SABER_POLYVECBYTES    (SABER_K * SABER_POLYBYTES)
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8)
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)
 
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
-
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
-
-#define SABER_SCALEBYTES (SABER_DELTA*SABER_N/8)
-
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
-
-
-
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM)
 
 #endif
-
diff --git a/crypto_kem/saber/clean/api.h b/crypto_kem/saber/clean/api.h
index 66c3b8bf..699a19f4 100644
--- a/crypto_kem/saber/clean/api.h
+++ b/crypto_kem/saber/clean/api.h
@@ -1,14 +1,18 @@
 #ifndef PQCLEAN_SABER_CLEAN_API_H
 #define PQCLEAN_SABER_CLEAN_API_H
 
+
 #define PQCLEAN_SABER_CLEAN_CRYPTO_ALGNAME "Saber"
-#define PQCLEAN_SABER_CLEAN_CRYPTO_SECRETKEYBYTES 2304
-#define PQCLEAN_SABER_CLEAN_CRYPTO_PUBLICKEYBYTES (3*320+32)
 #define PQCLEAN_SABER_CLEAN_CRYPTO_BYTES 32
 #define PQCLEAN_SABER_CLEAN_CRYPTO_CIPHERTEXTBYTES 1088
+#define PQCLEAN_SABER_CLEAN_CRYPTO_PUBLICKEYBYTES 992
+#define PQCLEAN_SABER_CLEAN_CRYPTO_SECRETKEYBYTES 2304
 
 int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
-int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
+
+int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, const unsigned char *pk);
+
+int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
+
 
 #endif /* api_h */
diff --git a/crypto_kem/saber/clean/cbd.c b/crypto_kem/saber/clean/cbd.c
index a2d9fcdd..e0ccef9d 100644
--- a/crypto_kem/saber/clean/cbd.c
+++ b/crypto_kem/saber/clean/cbd.c
@@ -1,3 +1,7 @@
+#include "SABER_params.h"
+#include "api.h"
+#include "cbd.h"
+#include <stdint.h>
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -6,12 +10,8 @@ by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
-#include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
-#include <stdint.h>
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,32 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_SABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
     uint32_t t, d, a[4], b[4];
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
-        t = (uint32_t) load_littleendian(buf + 4 * i, 4);
+        t = (uint32_t) load_littleendian(buf + 4 * i, 4); // 4 bytes requested, so narrowing the uint64_t result is intentional
         d = 0;
         for (j = 0; j < 4; j++) {
             d += (t >> j) & 0x11111111;
         }
 
-        a[0] =  d & 0xf;
-        b[0] = (d >>  4) & 0xf;
-        a[1] = (d >>  8) & 0xf;
+        a[0] = d & 0xf;
+        b[0] = (d >> 4) & 0xf;
+        a[1] = (d >> 8) & 0xf;
         b[1] = (d >> 12) & 0xf;
         a[2] = (d >> 16) & 0xf;
         b[2] = (d >> 20) & 0xf;
         a[3] = (d >> 24) & 0xf;
         b[3] = (d >> 28);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]);
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/saber/clean/cbd.h b/crypto_kem/saber/clean/cbd.h
index b307921f..88b0b0b5 100644
--- a/crypto_kem/saber/clean/cbd.h
+++ b/crypto_kem/saber/clean/cbd.h
@@ -1,6 +1,5 @@
 #ifndef CBD_H
 #define CBD_H
-
 /*---------------------------------------------------------------------
 This file has been adapted from the implementation
 (available at, Public Domain https://github.com/pq-crystals/kyber)
@@ -8,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_SABER_CLEAN_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
+
 
 #endif
diff --git a/crypto_kem/saber/clean/kem.c b/crypto_kem/saber/clean/kem.c
index 9e5b01f4..ed8e3ac7 100644
--- a/crypto_kem/saber/clean/kem.c
+++ b/crypto_kem/saber/clean/kem.c
@@ -1,5 +1,6 @@
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
+#include "api.h"
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
@@ -7,90 +8,71 @@
 #include <stdio.h>
 #include <string.h>
 
-int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
+
+int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
     int i;
 
-    // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
-    PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk);
-
-    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_SECRETKEYBYTES-1] <-- pk
+    PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
-        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];
+        sk[i + SABER_INDCPA_SECRETKEYBYTES] = pk[i];    // sk[SABER_INDCPA_SECRETKEYBYTES:SABER_INDCPA_SECRETKEYBYTES+SABER_INDCPA_PUBLICKEYBYTES-1] <-- pk
     }
 
-    // Then hash(pk) is appended.
-    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES);
+    sha3_256(sk + SABER_SECRETKEYBYTES - 64, pk, SABER_INDCPA_PUBLICKEYBYTES); // Then hash(pk) is appended.
 
-    // Remaining part of sk contains a pseudo-random number.
-    // This is output when check in crypto_kem_dec() fails.
-    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES );
+    randombytes(sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES); // Remaining part of sk contains a pseudo-random number.
+    // This is output when check in PQCLEAN_SABER_CLEAN_crypto_kem_dec() fails.
     return (0);
 }
 
-int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) {
-    // Will contain key, coins
-    unsigned char kr[64];
-    unsigned char buf[64];
+int PQCLEAN_SABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk) {
+
+    uint8_t kr[64]; // Will contain key, coins
+    uint8_t buf[64];
 
     randombytes(buf, 32);
 
-    // BUF[0:31] <-- random message (will be used as the key for client) Note: hash doesnot release system RNG output
-    sha3_256(buf, buf, 32);
+    sha3_256(buf, buf, 32); // buf[0:31] <-- random message (will be used as the key for client) Note: hash does not release system RNG output
 
-    // BUF[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
-    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES);
-
-    // kr[0:63] <-- Hash(buf[0:63]);
-    sha3_512(kr, buf, 64);
+    sha3_256(buf + 32, pk, SABER_INDCPA_PUBLICKEYBYTES); // buf[32:63] <-- Hash(public key);  Multitarget countermeasure for coins + contributory KEM
 
+    sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    // buf[0:31] contains message; kr[32:63] contains randomness r;
-    PQCLEAN_SABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk,  ct);
+    PQCLEAN_SABER_CLEAN_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
 
-
-int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) {
+int PQCLEAN_SABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
     int i;
-    unsigned char fail;
-    unsigned char cmp[SABER_BYTES_CCA_DEC];
-    unsigned char buf[64];
-
-    // Will contain key, coins
-    unsigned char kr[64];
-    const unsigned char *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
-
-    // buf[0:31] <-- message
-    PQCLEAN_SABER_CLEAN_indcpa_kem_dec(sk, ct, buf);
+    uint8_t fail;
+    uint8_t cmp[SABER_BYTES_CCA_DEC];
+    uint8_t buf[64];
+    uint8_t kr[64]; // Will contain key, coins
+    const uint8_t *pk = sk + SABER_INDCPA_SECRETKEYBYTES;
 
+    PQCLEAN_SABER_CLEAN_indcpa_kem_dec(buf, sk, c); // buf[0:31] <-- message
 
     // Multitarget countermeasure for coins + contributory KEM
-    // Save hash by storing h(pk) in sk
-    for (i = 0; i < 32; i++) {
+    for (i = 0; i < 32; i++) { // Save hash by storing h(pk) in sk
         buf[32 + i] = sk[SABER_SECRETKEYBYTES - 64 + i];
     }
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_SABER_CLEAN_indcpa_kem_enc(buf, kr + 32, pk, cmp);
+    PQCLEAN_SABER_CLEAN_indcpa_kem_enc(cmp, buf, kr + 32, pk);
 
+    fail = PQCLEAN_SABER_CLEAN_verify(c, cmp, SABER_BYTES_CCA_DEC);
 
-    fail = PQCLEAN_SABER_CLEAN_verify(ct, cmp, SABER_BYTES_CCA_DEC);
-
-    // overwrite coins in kr with h(c)
-    sha3_256(kr + 32, ct, SABER_BYTES_CCA_DEC);
+    sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC); // overwrite coins in kr with h(c)
 
     PQCLEAN_SABER_CLEAN_cmov(kr, sk + SABER_SECRETKEYBYTES - SABER_KEYBYTES, SABER_KEYBYTES, fail);
 
-    // hash concatenation of pre-k and h(c) to k
-    sha3_256(ss, kr, 64);
+    sha3_256(k, kr, 64); // hash concatenation of pre-k and h(c) to k
 
     return (0);
 }
diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c
index 06a74778..e196bd34 100644
--- a/crypto_kem/saber/clean/pack_unpack.c
+++ b/crypto_kem/saber/clean/pack_unpack.c
@@ -1,254 +1,132 @@
+#include "api.h"
 #include "pack_unpack.h"
+#include <string.h>
 
-void PQCLEAN_SABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) |
-                                 ((data[offset_data + 1] & 0x7) << 3) |
-                                 ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  |
-                                 ((data[offset_data + 3] & 0x7) << 1) |
-                                 ((data[offset_data + 4] & 0x7) << 4) |
-                                 (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) |
-                                 ((data[offset_data + 6] & 0x7) << 2) |
-                                 ((data[offset_data + 7] & 0x7) << 5);
-    }
-}
-
-void PQCLEAN_SABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6 ) & 0x03) |
-                                (((bytes[offset_byte + 1]) & 0x01) << 2);
-        data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7 ) & 0x01) |
-                                (((bytes[offset_byte + 2]) & 0x03) << 1);
-        data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07);
-        data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07);
-    }
-}
-
-void PQCLEAN_SABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data;
-
+void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) {
+    size_t j, offset_byte, offset_data;
     for (j = 0; j < SABER_N / 2; j++) {
+        offset_byte = j;
         offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) |
-                   ((data[offset_data + 1] & 0x0f) << 4);
+        bytes[offset_byte] = (data[offset_data] & 0x0f) | ((data[offset_data + 1] & 0x0f) << 4);
     }
 }
 
-void PQCLEAN_SABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar) {
-    uint32_t j;
-    uint32_t offset_data;
-
+void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    size_t j, offset_byte, offset_data;
     for (j = 0; j < SABER_N / 2; j++) {
+        offset_byte = j;
         offset_data = 2 * j;
-        ar[offset_data] = bytes[j] & 0x0f;
-        ar[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
+        data[offset_data] = bytes[offset_byte] & 0x0f;
+        data[offset_data + 1] = (bytes[offset_byte] >> 4) & 0x0f;
     }
 }
 
-void PQCLEAN_SABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) |
-                                 ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) |
-                                 ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) |
-                                 ((data[offset_data + 3] & 0x3f) << 2);
-    }
-}
-
-
-void PQCLEAN_SABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |
-                                ((bytes[offset_byte + 1] & 0x0f) << 2);
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) |
-                                ((bytes[offset_byte + 2] & 0x03) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
-    }
-}
-
-
-static void POLVECp2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x03) |
-                                     ((data[i][offset_data + 1] & 0x3f) << 2);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 6) & 0x0f) |
-                                     ((data[i][offset_data + 2] & 0x0f) << 4);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 2] >> 4) & 0x3f) |
-                                     ((data[i][offset_data + 3] & 0x03) << 6);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 3] >> 2) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECp(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x03) << 8);
-            data[i][offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) |
-                                       ((bytes[offset_byte + 2] & 0x0f) << 6);
-            data[i][offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) |
-                                       ((bytes[offset_byte + 3] & 0x3f) << 4);
-            data[i][offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) |
-                                       ((bytes[offset_byte + 4] & 0xff) << 2);
-        }
-    }
-}
-
-
-
-static void POLVECq2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = (data[i][offset_data + 0] & (0xff));
-            bytes[offset_byte + 1] = ((data[i][offset_data + 0] >> 8) & 0x1f) |
-                                     ((data[i][offset_data + 1] & 0x07) << 5);
-            bytes[offset_byte + 2] = ((data[i][offset_data + 1] >> 3) & 0xff);
-            bytes[offset_byte + 3] = ((data[i][offset_data + 1] >> 11) & 0x03) |
-                                     ((data[i][offset_data + 2] & 0x3f) << 2);
-            bytes[offset_byte + 4] = ((data[i][offset_data + 2] >> 6) & 0x7f) |
-                                     ((data[i][offset_data + 3] & 0x01) << 7);
-            bytes[offset_byte + 5] = ((data[i][offset_data + 3] >> 1) & 0xff);
-            bytes[offset_byte + 6] = ((data[i][offset_data + 3] >> 9) & 0x0f) |
-                                     ((data[i][offset_data + 4] & 0x0f) << 4);
-            bytes[offset_byte + 7] = ((data[i][offset_data + 4] >> 4) & 0xff);
-            bytes[offset_byte + 8] = ((data[i][offset_data + 4] >> 12) & 0x01) |
-                                     ((data[i][offset_data + 5] & 0x7f) << 1);
-            bytes[offset_byte + 9] = ((data[i][offset_data + 5] >> 7) & 0x3f) |
-                                     ((data[i][offset_data + 6] & 0x03) << 6);
-            bytes[offset_byte + 10] = ((data[i][offset_data + 6] >> 2) & 0xff);
-            bytes[offset_byte + 11] = ((data[i][offset_data + 6] >> 10) & 0x07) |
-                                      ((data[i][offset_data + 7] & 0x1f) << 3);
-            bytes[offset_byte + 12] = ((data[i][offset_data + 7] >> 5) & 0xff);
-        }
-    }
-}
-
-static void BS2POLVECq(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N]) {
-    uint32_t i, j;
-    uint32_t offset_data, offset_byte, offset_byte1;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                       ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                       ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                       ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                       ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                       ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                       ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                       ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                       ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                       ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                       ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                       ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                       ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-}
-
-//only BS2POLq no BS2POLp
-void PQCLEAN_SABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]) {
-    uint32_t j;
-    uint32_t offset_data, offset_byte;
-
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) {
+    size_t j, offset_byte, offset_data;
     for (j = 0; j < SABER_N / 8; j++) {
         offset_byte = 13 * j;
         offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) |
-                                ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) |
-                                ((bytes[offset_byte + 2] & 0xff) << 3) |
-                                ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) |
-                                ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) |
-                                ((bytes[offset_byte + 5] & 0xff) << 1) |
-                                ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) |
-                                ((bytes[offset_byte + 7] & 0xff) << 4) |
-                                ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) |
-                                ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) |
-                                ((bytes[offset_byte + 10] & 0xff) << 2) |
-                                ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) |
-                                ((bytes[offset_byte + 12] & 0xff) << 5);
+        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
+        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5);
+        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
+        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
+        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
+        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
+        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
+        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
+        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
+        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
+        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
+        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
+        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff);
     }
 }
 
-void PQCLEAN_SABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        POLVECq2BS(bytes, data);
+static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) {
+    size_t j, offset_byte, offset_data;
+    for (j = 0; j < SABER_N / 8; j++) {
+        offset_byte = 13 * j;
+        offset_data = 8 * j;
+        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
+        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
+        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
+        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
+        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
+        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
+        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
+        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
     }
 }
 
-void PQCLEAN_SABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-    if (modulus == 1024) {
-        BS2POLVECp(bytes, data);
-    } else if (modulus == 8192) {
-        BS2POLVECq(bytes, data);
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) {
+    size_t j, offset_byte, offset_data;
+    for (j = 0; j < SABER_N / 4; j++) {
+        offset_byte = 5 * j;
+        offset_data = 4 * j;
+        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
+        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2);
+        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
+        bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6);
+        bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff);
+    }
+}
+
+static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
+    size_t j, offset_byte, offset_data;
+    for (j = 0; j < SABER_N / 4; j++) {
+        offset_byte = 5 * j;
+        offset_data = 4 * j;
+        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8);
+        data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6);
+        data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4);
+        data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLq2BS(bytes + i * SABER_POLYBYTES, data[i]);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLq(data[i], bytes + i * SABER_POLYBYTES);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8));
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) {
+    size_t i, j;
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            data[j * 8 + i] = ((bytes[j] >> i) & 0x01);
+        }
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) {
+    size_t i, j;
+    memset(bytes, 0, SABER_KEYBYTES);
+
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i);
+        }
     }
 }
diff --git a/crypto_kem/saber/clean/pack_unpack.h b/crypto_kem/saber/clean/pack_unpack.h
index 2431a217..52537c07 100644
--- a/crypto_kem/saber/clean/pack_unpack.h
+++ b/crypto_kem/saber/clean/pack_unpack.h
@@ -1,28 +1,27 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
-
 #include "SABER_params.h"
 #include <stdint.h>
 #include <stdio.h>
 
+void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]);
 
-void PQCLEAN_SABER_CLEAN_pack_3bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_SABER_CLEAN_un_pack3bit(const uint8_t *bytes, uint16_t *data);
-
-void PQCLEAN_SABER_CLEAN_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_SABER_CLEAN_un_pack4bit(const unsigned char *bytes, uint16_t *ar);
-
-void PQCLEAN_SABER_CLEAN_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_SABER_CLEAN_un_pack6bit(const unsigned char *bytes, uint16_t *data);
+void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_SABER_CLEAN_BS2POL(const unsigned char *bytes, uint16_t data[SABER_N]);
+void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]);
 
-void PQCLEAN_SABER_CLEAN_POLVEC2BS(uint8_t *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]);
+
+
+void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+
+void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+
+
+void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+
+void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
 
-void PQCLEAN_SABER_CLEAN_BS2POLVEC(const unsigned char *bytes, uint16_t data[SABER_K][SABER_N], uint16_t modulus);
 
 #endif
diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c
index 93f55fde..f0403ccf 100644
--- a/crypto_kem/saber/clean/poly.c
+++ b/crypto_kem/saber/clean/poly.c
@@ -1,21 +1,49 @@
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-#include "SABER_params.h"
+#include "api.h"
 #include "cbd.h"
 #include "fips202.h"
+#include "pack_unpack.h"
 #include "poly.h"
+#include "poly_mul.h"
+#include <stdio.h>
 
-void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed) {
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (size_t i = 0; i < SABER_K; i++) {
-        PQCLEAN_SABER_CLEAN_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
+void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
+    int i, j;
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_L; j++) {
+            if (transpose == 1) {
+                PQCLEAN_SABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]);
+            } else {
+                PQCLEAN_SABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]);
+            }
+        }
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
+    int j;
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_SABER_CLEAN_poly_mul_acc(b[j], s[j], res);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
+    int i;
+
+    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_SABER_CLEAN_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES);
+    }
+}
+
+void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) {
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES];
+    size_t i;
+
+    shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES);
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_SABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES);
     }
 }
diff --git a/crypto_kem/saber/clean/poly.h b/crypto_kem/saber/clean/poly.h
index 9d216804..dd882cb7 100644
--- a/crypto_kem/saber/clean/poly.h
+++ b/crypto_kem/saber/clean/poly.h
@@ -1,26 +1,15 @@
 #ifndef POLY_H
 #define POLY_H
-
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
-
-
 #include "SABER_params.h"
 #include <stdint.h>
 
-typedef struct {
-    uint16_t coeffs[SABER_N];
-} poly;
+void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose);
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]);
+
+void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
 
-void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t r[SABER_K][SABER_N], const unsigned char *seed);
 
 #endif
diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c
index dc1cc779..0655383b 100644
--- a/crypto_kem/saber/clean/poly_mul.c
+++ b/crypto_kem/saber/clean/poly_mul.c
@@ -228,19 +228,15 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
     }
 }
 
-void PQCLEAN_SABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n) {
-    uint32_t i;
-    // normal multiplication
-    uint16_t c[512];
-
-    for (i = 0; i < 512; i++) {
-        c[i] = 0;
-    }
+/* res += a*b */
+void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+    uint16_t c[2 * SABER_N] = {0};
+    int i;
 
     toom_cook_4way(a, b, c);
 
-    // reduction
-    for (i = n; i < 2 * n; i++) {
-        res[i - n] = (c[i - n] - c[i]) & (p - 1);
+    /* reduction */
+    for (i = SABER_N; i < 2 * SABER_N; i++) {
+        res[i - SABER_N] += (c[i - SABER_N] - c[i]);
     }
 }
diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h
index f813be10..e0f10043 100644
--- a/crypto_kem/saber/clean/poly_mul.h
+++ b/crypto_kem/saber/clean/poly_mul.h
@@ -1,9 +1,9 @@
-#ifndef POLYMUL_H
-#define POLYMUL_H
-
+#ifndef POLY_MUL_H
+#define POLY_MUL_H
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_SABER_CLEAN_pol_mul(uint16_t *a, uint16_t *b, uint16_t *res, uint16_t p, uint32_t n);
+void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]);
+
 
 #endif
diff --git a/crypto_kem/saber/clean/verify.c b/crypto_kem/saber/clean/verify.c
index 81f30604..72f4dd34 100644
--- a/crypto_kem/saber/clean/verify.c
+++ b/crypto_kem/saber/clean/verify.c
@@ -1,3 +1,5 @@
+#include "verify.h"
+
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -5,26 +7,25 @@ This file has been adapted from the implementation
  by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------*/
-#include "verify.h"
-#include <stdint.h>
+
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_SABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len) {
+uint8_t PQCLEAN_SABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len) {
     uint64_t r;
     size_t i;
-
     r = 0;
+
     for (i = 0; i < len; i++) {
         r |= a[i] ^ b[i];
     }
 
     r = (~r + 1); // Two's complement
     r >>= 63;
-    return (unsigned char)r;
+    return (uint8_t) r;
 }
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_SABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b) {
+void PQCLEAN_SABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b) {
     size_t i;
 
     b = -b;
diff --git a/crypto_kem/saber/clean/verify.h b/crypto_kem/saber/clean/verify.h
index cacb2ee6..f88fe396 100644
--- a/crypto_kem/saber/clean/verify.h
+++ b/crypto_kem/saber/clean/verify.h
@@ -1,6 +1,5 @@
 #ifndef VERIFY_H
 #define VERIFY_H
-
 /*-------------------------------------------------
 This file has been adapted from the implementation
 (available at https://github.com/pq-crystals/kyber) of
@@ -13,9 +12,11 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 #include <stdint.h>
 
 /* returns 0 for equal strings, 1 for non-equal strings */
-unsigned char PQCLEAN_SABER_CLEAN_verify(const unsigned char *a, const unsigned char *b, size_t len);
+uint8_t PQCLEAN_SABER_CLEAN_verify(const uint8_t *a, const uint8_t *b, size_t len);
+
 
 /* b = 1 means mov, b = 0 means don't mov*/
-void PQCLEAN_SABER_CLEAN_cmov(unsigned char *r, const unsigned char *x, size_t len, unsigned char b);
+void PQCLEAN_SABER_CLEAN_cmov(uint8_t *r, const uint8_t *x, size_t len, uint8_t b);
+
 
 #endif
diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml
new file mode 100644
index 00000000..1790559f
--- /dev/null
+++ b/test/duplicate_consistency/firesaber_avx2.yml
@@ -0,0 +1,7 @@
+consistency_checks:
+    - source:
+        scheme: firesaber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml
index 60a1a153..3e93674e 100644
--- a/test/duplicate_consistency/firesaber_clean.yml
+++ b/test/duplicate_consistency/firesaber_clean.yml
@@ -1,31 +1,7 @@
 consistency_checks:
-- source:
-    scheme: lightsaber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
-- source:
-    scheme: saber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
+    - source:
+        scheme: firesaber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c
diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml
new file mode 100644
index 00000000..9239f8f0
--- /dev/null
+++ b/test/duplicate_consistency/lightsaber_avx2.yml
@@ -0,0 +1,45 @@
+consistency_checks:
+    - source:
+        scheme: lightsaber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: saber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: saber
+        implementation: avx2
+      files:
+      - cbd.h
+      - kem.h
+      - pack_unpack.h
+      - poly.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - pack_unpack.c
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: avx2
+      files:
+      - cbd.h
+      - kem.h
+      - pack_unpack.h
+      - poly.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - pack_unpack.c
+      - verify.c
diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml
index a4d483be..14c8975d 100644
--- a/test/duplicate_consistency/lightsaber_clean.yml
+++ b/test/duplicate_consistency/lightsaber_clean.yml
@@ -1,31 +1,49 @@
 consistency_checks:
-- source:
-    scheme: saber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
-- source:
-    scheme: firesaber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
+    - source:
+        scheme: lightsaber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: saber
+        implementation: clean
+      files:
+      - cbd.h
+      - pack_unpack.h
+      - poly.h
+      - poly_mul.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
+      - verify.c
+    - source:
+        scheme: saber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: clean
+      files:
+      - cbd.h
+      - pack_unpack.h
+      - poly.h
+      - poly_mul.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c
diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml
new file mode 100644
index 00000000..010ac0c9
--- /dev/null
+++ b/test/duplicate_consistency/saber_avx2.yml
@@ -0,0 +1,26 @@
+consistency_checks:
+    - source:
+        scheme: saber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: clean
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: avx2
+      files:
+      - cbd.h
+      - kem.h
+      - pack_unpack.h
+      - poly.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - pack_unpack.c
+      - verify.c
diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml
index 0e1b89dd..7f01d619 100644
--- a/test/duplicate_consistency/saber_clean.yml
+++ b/test/duplicate_consistency/saber_clean.yml
@@ -1,31 +1,28 @@
 consistency_checks:
-- source:
-    scheme: lightsaber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
-- source:
-    scheme: firesaber
-    implementation: clean
-  files:
-    - cbd.h
-    - kem.c
-    - pack_unpack.c
-    - pack_unpack.h
-    - poly.c
-    - poly.h
-    - poly_mul.c
-    - poly_mul.h
-    - SABER_indcpa.h
-    - verify.c
-    - verify.h 
+    - source:
+        scheme: saber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: clean
+      files:
+      - cbd.h
+      - pack_unpack.h
+      - poly.h
+      - poly_mul.h
+      - SABER_indcpa.h
+      - verify.h
+      - kem.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
+      - verify.c
+    - source:
+        scheme: firesaber
+        implementation: avx2
+      files:
+      - verify.h
+      - verify.c

From baa309ea7a0b8e7c2b7d6ab3829713ae179ec790 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Thu, 15 Oct 2020 20:45:21 -0400
Subject: [PATCH 02/10] *saber/avx2: declare mul_add static

---
 crypto_kem/firesaber/avx2/polymul/scm_avx.c  | 2 +-
 crypto_kem/lightsaber/avx2/polymul/scm_avx.c | 2 +-
 crypto_kem/saber/avx2/polymul/scm_avx.c      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
index 4e4f11f8..48870f51 100644
--- a/crypto_kem/firesaber/avx2/polymul/scm_avx.c
+++ b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
@@ -4,7 +4,7 @@
 
 #include <immintrin.h>
 
-inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
     return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
 }
 
diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
index 4e4f11f8..48870f51 100644
--- a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
+++ b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
@@ -4,7 +4,7 @@
 
 #include <immintrin.h>
 
-inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
     return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
 }
 
diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c
index 4e4f11f8..48870f51 100644
--- a/crypto_kem/saber/avx2/polymul/scm_avx.c
+++ b/crypto_kem/saber/avx2/polymul/scm_avx.c
@@ -4,7 +4,7 @@
 
 #include <immintrin.h>
 
-inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
     return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
 }
 

From 2f97b11031d2329a257f1e0ea8b7c9dc22e88e00 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Fri, 16 Oct 2020 07:48:23 -0400
Subject: [PATCH 03/10] saber: fix initialization and casting issues

---
 crypto_kem/firesaber/clean/SABER_indcpa.c  | 4 ++--
 crypto_kem/firesaber/clean/cbd.c           | 2 +-
 crypto_kem/lightsaber/clean/SABER_indcpa.c | 4 ++--
 crypto_kem/saber/clean/SABER_indcpa.c      | 4 ++--
 crypto_kem/saber/clean/cbd.c               | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c
index 8f4364e7..76156e79 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.c
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.c
@@ -13,7 +13,7 @@
 void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {0};
+    uint16_t b[SABER_L][SABER_N] = {{0}};
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
@@ -41,7 +41,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKE
 void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {0};
+    uint16_t bp[SABER_L][SABER_N] = {{0}};
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
diff --git a/crypto_kem/firesaber/clean/cbd.c b/crypto_kem/firesaber/clean/cbd.c
index 8032eb5c..28fbc61c 100644
--- a/crypto_kem/firesaber/clean/cbd.c
+++ b/crypto_kem/firesaber/clean/cbd.c
@@ -25,7 +25,7 @@ void PQCLEAN_FIRESABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_PO
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
-        t = load_littleendian(buf + 3 * i, 3);
+        t = (uint32_t) load_littleendian(buf + 3 * i, 3);
         d = 0;
         for (j = 0; j < 3; j++) {
             d += (t >> j) & 0x249249;
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c
index ccb72492..4b9cb150 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c
@@ -13,7 +13,7 @@
 void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {0};
+    uint16_t b[SABER_L][SABER_N] = {{0}};
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
@@ -41,7 +41,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICK
 void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {0};
+    uint16_t bp[SABER_L][SABER_N] = {{0}};
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c
index fe54f4ca..c36f02ea 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.c
+++ b/crypto_kem/saber/clean/SABER_indcpa.c
@@ -13,7 +13,7 @@
 void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {0};
+    uint16_t b[SABER_L][SABER_N] = {{0}};
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
@@ -41,7 +41,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYT
 void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
     uint16_t A[SABER_L][SABER_L][SABER_N];
     uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {0};
+    uint16_t bp[SABER_L][SABER_N] = {{0}};
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
diff --git a/crypto_kem/saber/clean/cbd.c b/crypto_kem/saber/clean/cbd.c
index e0ccef9d..b8dee33b 100644
--- a/crypto_kem/saber/clean/cbd.c
+++ b/crypto_kem/saber/clean/cbd.c
@@ -25,7 +25,7 @@ void PQCLEAN_SABER_CLEAN_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCO
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
-        t = load_littleendian(buf + 4 * i, 4);
+        t = (uint32_t) load_littleendian(buf + 4 * i, 4);
         d = 0;
         for (j = 0; j < 4; j++) {
             d += (t >> j) & 0x11111111;

From e92a052ea48563d5d06bcb539e76b9f36b351d13 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Mon, 19 Oct 2020 12:23:48 -0400
Subject: [PATCH 04/10] saber: output pointers on left, and size_t for indexing

---
 crypto_kem/firesaber/META.yml              |  4 ++--
 crypto_kem/firesaber/avx2/SABER_indcpa.c   | 12 +++++-----
 crypto_kem/firesaber/clean/SABER_indcpa.c  |  6 ++---
 crypto_kem/firesaber/clean/kem.c           |  7 +++---
 crypto_kem/firesaber/clean/poly.c          | 14 ++++++------
 crypto_kem/firesaber/clean/poly_mul.c      | 26 +++++++++++-----------
 crypto_kem/firesaber/clean/poly_mul.h      |  2 +-
 crypto_kem/lightsaber/META.yml             |  4 ++--
 crypto_kem/lightsaber/avx2/SABER_indcpa.c  | 12 +++++-----
 crypto_kem/lightsaber/clean/SABER_indcpa.c |  6 ++---
 crypto_kem/lightsaber/clean/kem.c          |  7 +++---
 crypto_kem/lightsaber/clean/poly.c         | 14 ++++++------
 crypto_kem/lightsaber/clean/poly_mul.c     | 26 +++++++++++-----------
 crypto_kem/lightsaber/clean/poly_mul.h     |  2 +-
 crypto_kem/saber/META.yml                  |  4 ++--
 crypto_kem/saber/avx2/SABER_indcpa.c       | 12 +++++-----
 crypto_kem/saber/clean/SABER_indcpa.c      |  6 ++---
 crypto_kem/saber/clean/kem.c               |  7 +++---
 crypto_kem/saber/clean/poly.c              | 14 ++++++------
 crypto_kem/saber/clean/poly_mul.c          | 26 +++++++++++-----------
 crypto_kem/saber/clean/poly_mul.h          |  2 +-
 21 files changed, 105 insertions(+), 108 deletions(-)

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index def16e46..d1781339 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c
index ab017224..625a3f6b 100644
--- a/crypto_kem/firesaber/avx2/SABER_indcpa.c
+++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c
@@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
 }
 
 //********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
     int64_t i, j;
 
     __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
@@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1
 
 }
 
-static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
 
     int64_t i;
 
@@ -162,7 +162,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
 
     // Now truncation
 
@@ -259,7 +259,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DE
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
 
     // Now truncation
 
@@ -302,7 +302,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DE
 
     // vector-vector scalar multiplication with mod p
 
-    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
 
     // Computation of v'+h1
     for (i = 0; i < SABER_N / 16; i++) { //adding h1
@@ -392,7 +392,7 @@ void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint
         TC_eval(sksv_avx[j], b_bucket[j]);
     }
 
-    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+    vector_vector_mul(v_avx, pksv_avx, b_bucket);
 
     for (i = 0; i < SABER_N / 16; i++) {
         _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c
index 76156e79..342eb7ca 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.c
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.c
@@ -17,7 +17,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKE
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
-    int i, j;
+    size_t i, j;
 
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
@@ -45,7 +45,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
-    int i, j;
+    size_t i, j;
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
     PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A);
@@ -77,7 +77,7 @@ void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uin
     uint16_t b[SABER_L][SABER_N];
     uint16_t v[SABER_N] = {0};
     uint16_t cm[SABER_N];
-    int i;
+    size_t i;
 
     PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk);
     PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext);
diff --git a/crypto_kem/firesaber/clean/kem.c b/crypto_kem/firesaber/clean/kem.c
index e94219a6..2ffe4e75 100644
--- a/crypto_kem/firesaber/clean/kem.c
+++ b/crypto_kem/firesaber/clean/kem.c
@@ -4,13 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_FIRESABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -48,7 +47,7 @@ int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t
 }
 
 int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c
index c65175fe..c6d729ba 100644
--- a/crypto_kem/firesaber/clean/poly.c
+++ b/crypto_kem/firesaber/clean/poly.c
@@ -4,31 +4,31 @@
 #include "pack_unpack.h"
 #include "poly.h"
 #include "poly_mul.h"
-#include <stdio.h>
+#include <stddef.h>
 
 void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
-    int i, j;
+    size_t i, j;
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_L; j++) {
             if (transpose == 1) {
-                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]);
+                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
             } else {
-                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]);
+                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
             }
         }
     }
 }
 
 void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    int j;
+    size_t j;
     for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(b[j], s[j], res);
+        PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
     }
 }
 
 void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
     uint8_t buf[SABER_L * SABER_POLYVECBYTES];
-    int i;
+    size_t i;
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c
index 27c92f29..6b527c21 100644
--- a/crypto_kem/firesaber/clean/poly_mul.c
+++ b/crypto_kem/firesaber/clean/poly_mul.c
@@ -11,13 +11,13 @@
 #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y)))
 
 #define KARATSUBA_N 64
-static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) {
+static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) {
     uint16_t d01[KARATSUBA_N / 2 - 1];
     uint16_t d0123[KARATSUBA_N / 2 - 1];
     uint16_t d23[KARATSUBA_N / 2 - 1];
     uint16_t result_d01[KARATSUBA_N - 1];
 
-    int32_t i, j;
+    size_t i, j;
 
     memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t));
     memset(d01, 0, (KARATSUBA_N / 2 - 1)*sizeof(uint16_t));
@@ -110,7 +110,7 @@ static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t
 
 
 
-static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *result) {
+static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) {
     uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167;
 
     uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB];
@@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 
     // MULTIPLICATION
 
-    karatsuba_simple(aw1, bw1, w1);
-    karatsuba_simple(aw2, bw2, w2);
-    karatsuba_simple(aw3, bw3, w3);
-    karatsuba_simple(aw4, bw4, w4);
-    karatsuba_simple(aw5, bw5, w5);
-    karatsuba_simple(aw6, bw6, w6);
-    karatsuba_simple(aw7, bw7, w7);
+    karatsuba_simple(w1, aw1, bw1);
+    karatsuba_simple(w2, aw2, bw2);
+    karatsuba_simple(w3, aw3, bw3);
+    karatsuba_simple(w4, aw4, bw4);
+    karatsuba_simple(w5, aw5, bw5);
+    karatsuba_simple(w6, aw6, bw6);
+    karatsuba_simple(w7, aw7, bw7);
 
     // INTERPOLATION
     for (i = 0; i < N_SB_RES; ++i) {
@@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 }
 
 /* res += a*b */
-void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
     uint16_t c[2 * SABER_N] = {0};
-    int i;
+    size_t i;
 
-    toom_cook_4way(a, b, c);
+    toom_cook_4way(c, a, b);
 
     /* reduction */
     for (i = SABER_N; i < 2 * SABER_N; i++) {
diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h
index e554d60c..b6911577 100644
--- a/crypto_kem/firesaber/clean/poly_mul.h
+++ b/crypto_kem/firesaber/clean/poly_mul.h
@@ -3,7 +3,7 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index 1cc06c9a..7e1dd2eb 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
index 3270a8c9..47f760e9 100644
--- a/crypto_kem/lightsaber/avx2/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
@@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
 }
 
 //********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
     int64_t i, j;
 
     __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
@@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1
 
 }
 
-static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
 
     int64_t i;
 
@@ -162,7 +162,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
 
     // Now truncation
 
@@ -259,7 +259,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
 
     // Now truncation
 
@@ -302,7 +302,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_D
 
     // vector-vector scalar multiplication with mod p
 
-    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
 
     // Computation of v'+h1
     for (i = 0; i < SABER_N / 16; i++) { //adding h1
@@ -392,7 +392,7 @@ void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uin
         TC_eval(sksv_avx[j], b_bucket[j]);
     }
 
-    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+    vector_vector_mul(v_avx, pksv_avx, b_bucket);
 
     for (i = 0; i < SABER_N / 16; i++) {
         _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c
index 4b9cb150..9dcdfb93 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c
@@ -17,7 +17,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICK
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
-    int i, j;
+    size_t i, j;
 
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
@@ -45,7 +45,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
-    int i, j;
+    size_t i, j;
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
     PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A);
@@ -77,7 +77,7 @@ void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const ui
     uint16_t b[SABER_L][SABER_N];
     uint16_t v[SABER_N] = {0};
     uint16_t cm[SABER_N];
-    int i;
+    size_t i;
 
     PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk);
     PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext);
diff --git a/crypto_kem/lightsaber/clean/kem.c b/crypto_kem/lightsaber/clean/kem.c
index eb9353b1..d0a67736 100644
--- a/crypto_kem/lightsaber/clean/kem.c
+++ b/crypto_kem/lightsaber/clean/kem.c
@@ -4,13 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -48,7 +47,7 @@ int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_
 }
 
 int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c
index 1c1e22cc..9bb55afe 100644
--- a/crypto_kem/lightsaber/clean/poly.c
+++ b/crypto_kem/lightsaber/clean/poly.c
@@ -4,31 +4,31 @@
 #include "pack_unpack.h"
 #include "poly.h"
 #include "poly_mul.h"
-#include <stdio.h>
+#include <stddef.h>
 
 void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
-    int i, j;
+    size_t i, j;
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_L; j++) {
             if (transpose == 1) {
-                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]);
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
             } else {
-                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]);
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
             }
         }
     }
 }
 
 void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    int j;
+    size_t j;
     for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(b[j], s[j], res);
+        PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
     }
 }
 
 void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
     uint8_t buf[SABER_L * SABER_POLYVECBYTES];
-    int i;
+    size_t i;
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c
index 5e37a024..c7f5c424 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.c
+++ b/crypto_kem/lightsaber/clean/poly_mul.c
@@ -11,13 +11,13 @@
 #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y)))
 
 #define KARATSUBA_N 64
-static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) {
+static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) {
     uint16_t d01[KARATSUBA_N / 2 - 1];
     uint16_t d0123[KARATSUBA_N / 2 - 1];
     uint16_t d23[KARATSUBA_N / 2 - 1];
     uint16_t result_d01[KARATSUBA_N - 1];
 
-    int32_t i, j;
+    size_t i, j;
 
     memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t));
     memset(d01, 0, (KARATSUBA_N / 2 - 1)*sizeof(uint16_t));
@@ -110,7 +110,7 @@ static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t
 
 
 
-static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *result) {
+static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) {
     uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167;
 
     uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB];
@@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 
     // MULTIPLICATION
 
-    karatsuba_simple(aw1, bw1, w1);
-    karatsuba_simple(aw2, bw2, w2);
-    karatsuba_simple(aw3, bw3, w3);
-    karatsuba_simple(aw4, bw4, w4);
-    karatsuba_simple(aw5, bw5, w5);
-    karatsuba_simple(aw6, bw6, w6);
-    karatsuba_simple(aw7, bw7, w7);
+    karatsuba_simple(w1, aw1, bw1);
+    karatsuba_simple(w2, aw2, bw2);
+    karatsuba_simple(w3, aw3, bw3);
+    karatsuba_simple(w4, aw4, bw4);
+    karatsuba_simple(w5, aw5, bw5);
+    karatsuba_simple(w6, aw6, bw6);
+    karatsuba_simple(w7, aw7, bw7);
 
     // INTERPOLATION
     for (i = 0; i < N_SB_RES; ++i) {
@@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 }
 
 /* res += a*b */
-void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
     uint16_t c[2 * SABER_N] = {0};
-    int i;
+    size_t i;
 
-    toom_cook_4way(a, b, c);
+    toom_cook_4way(c, a, b);
 
     /* reduction */
     for (i = SABER_N; i < 2 * SABER_N; i++) {
diff --git a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h
index 0d5cf6ed..5ec233bb 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.h
+++ b/crypto_kem/lightsaber/clean/poly_mul.h
@@ -3,7 +3,7 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
 #endif
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 50250180..87187702 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/commit/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c
index d16a7a06..5515c610 100644
--- a/crypto_kem/saber/avx2/SABER_indcpa.c
+++ b/crypto_kem/saber/avx2/SABER_indcpa.c
@@ -66,7 +66,7 @@ static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
 }
 
 //********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[NUM_POLY][AVX_N1], int isTranspose) {
+static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
     int64_t i, j;
 
     __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
@@ -86,7 +86,7 @@ static void matrix_vector_mul(__m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1
 
 }
 
-static void vector_vector_mul(__m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], __m256i res_avx[AVX_N1]) {
+static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
 
     int64_t i;
 
@@ -162,7 +162,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 1);// Matrix-vector multiplication; Matrix in transposed order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
 
     // Now truncation
 
@@ -259,7 +259,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC],
     for (j = 0; j < NUM_POLY; j++) {
         TC_eval(sk_avx[j], b_bucket[j]);
     }
-    matrix_vector_mul(a_avx, b_bucket, res_avx, 0);// Matrix-vector multiplication; Matrix in normal order
+    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
 
     // Now truncation
 
@@ -302,7 +302,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC],
 
     // vector-vector scalar multiplication with mod p
 
-    vector_vector_mul(pkcl_avx, b_bucket, vprime_avx);
+    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
 
     // Computation of v'+h1
     for (i = 0; i < SABER_N / 16; i++) { //adding h1
@@ -392,7 +392,7 @@ void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t
         TC_eval(sksv_avx[j], b_bucket[j]);
     }
 
-    vector_vector_mul(pksv_avx, b_bucket, v_avx);
+    vector_vector_mul(v_avx, pksv_avx, b_bucket);
 
     for (i = 0; i < SABER_N / 16; i++) {
         _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c
index c36f02ea..23325749 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.c
+++ b/crypto_kem/saber/clean/SABER_indcpa.c
@@ -17,7 +17,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYT
 
     uint8_t seed_A[SABER_SEEDBYTES];
     uint8_t seed_s[SABER_NOISE_SEEDBYTES];
-    int i, j;
+    size_t i, j;
 
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
@@ -45,7 +45,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC],
     uint16_t vp[SABER_N] = {0};
     uint16_t mp[SABER_N];
     uint16_t b[SABER_L][SABER_N];
-    int i, j;
+    size_t i, j;
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
     PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A);
@@ -77,7 +77,7 @@ void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t
     uint16_t b[SABER_L][SABER_N];
     uint16_t v[SABER_N] = {0};
     uint16_t cm[SABER_N];
-    int i;
+    size_t i;
 
     PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk);
     PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext);
diff --git a/crypto_kem/saber/clean/kem.c b/crypto_kem/saber/clean/kem.c
index ed8e3ac7..6a7f20c4 100644
--- a/crypto_kem/saber/clean/kem.c
+++ b/crypto_kem/saber/clean/kem.c
@@ -4,13 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_SABER_CLEAN_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -48,7 +47,7 @@ int PQCLEAN_SABER_CLEAN_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk
 }
 
 int PQCLEAN_SABER_CLEAN_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c
index f0403ccf..2c44e962 100644
--- a/crypto_kem/saber/clean/poly.c
+++ b/crypto_kem/saber/clean/poly.c
@@ -4,31 +4,31 @@
 #include "pack_unpack.h"
 #include "poly.h"
 #include "poly_mul.h"
-#include <stdio.h>
+#include <stddef.h>
 
 void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
-    int i, j;
+    size_t i, j;
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_L; j++) {
             if (transpose == 1) {
-                PQCLEAN_SABER_CLEAN_poly_mul_acc(A[j][i], s[j], res[i]);
+                PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
             } else {
-                PQCLEAN_SABER_CLEAN_poly_mul_acc(A[i][j], s[j], res[i]);
+                PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
             }
         }
     }
 }
 
 void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    int j;
+    size_t j;
     for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_SABER_CLEAN_poly_mul_acc(b[j], s[j], res);
+        PQCLEAN_SABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
     }
 }
 
 void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
     uint8_t buf[SABER_L * SABER_POLYVECBYTES];
-    int i;
+    size_t i;
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c
index 0655383b..686960dc 100644
--- a/crypto_kem/saber/clean/poly_mul.c
+++ b/crypto_kem/saber/clean/poly_mul.c
@@ -11,13 +11,13 @@
 #define OVERFLOWING_MUL(X, Y) ((uint16_t)((uint32_t)(X) * (uint32_t)(Y)))
 
 #define KARATSUBA_N 64
-static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t *result_final) {
+static void karatsuba_simple(uint16_t *result_final, const uint16_t *a_1, const uint16_t *b_1) {
     uint16_t d01[KARATSUBA_N / 2 - 1];
     uint16_t d0123[KARATSUBA_N / 2 - 1];
     uint16_t d23[KARATSUBA_N / 2 - 1];
     uint16_t result_d01[KARATSUBA_N - 1];
 
-    int32_t i, j;
+    size_t i, j;
 
     memset(result_d01, 0, (KARATSUBA_N - 1)*sizeof(uint16_t));
     memset(d01, 0, (KARATSUBA_N / 2 - 1)*sizeof(uint16_t));
@@ -110,7 +110,7 @@ static void karatsuba_simple(const uint16_t *a_1, const uint16_t *b_1, uint16_t
 
 
 
-static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *result) {
+static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t *b1) {
     uint16_t inv3 = 43691, inv9 = 36409, inv15 = 61167;
 
     uint16_t aw1[N_SB], aw2[N_SB], aw3[N_SB], aw4[N_SB], aw5[N_SB], aw6[N_SB], aw7[N_SB];
@@ -181,13 +181,13 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 
     // MULTIPLICATION
 
-    karatsuba_simple(aw1, bw1, w1);
-    karatsuba_simple(aw2, bw2, w2);
-    karatsuba_simple(aw3, bw3, w3);
-    karatsuba_simple(aw4, bw4, w4);
-    karatsuba_simple(aw5, bw5, w5);
-    karatsuba_simple(aw6, bw6, w6);
-    karatsuba_simple(aw7, bw7, w7);
+    karatsuba_simple(w1, aw1, bw1);
+    karatsuba_simple(w2, aw2, bw2);
+    karatsuba_simple(w3, aw3, bw3);
+    karatsuba_simple(w4, aw4, bw4);
+    karatsuba_simple(w5, aw5, bw5);
+    karatsuba_simple(w6, aw6, bw6);
+    karatsuba_simple(w7, aw7, bw7);
 
     // INTERPOLATION
     for (i = 0; i < N_SB_RES; ++i) {
@@ -229,11 +229,11 @@ static void toom_cook_4way (const uint16_t *a1, const uint16_t *b1, uint16_t *re
 }
 
 /* res += a*b */
-void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]) {
+void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
     uint16_t c[2 * SABER_N] = {0};
-    int i;
+    size_t i;
 
-    toom_cook_4way(a, b, c);
+    toom_cook_4way(c, a, b);
 
     /* reduction */
     for (i = SABER_N; i < 2 * SABER_N; i++) {
diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h
index e0f10043..82140f5b 100644
--- a/crypto_kem/saber/clean/poly_mul.h
+++ b/crypto_kem/saber/clean/poly_mul.h
@@ -3,7 +3,7 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_SABER_CLEAN_poly_mul_acc(const uint16_t a[SABER_N], const uint16_t b[SABER_N], uint16_t res[SABER_N]);
+void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
 #endif

From 135f95e15b66d6cb6c8e0273c0b63948b5e0586c Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 28 Oct 2020 12:02:04 -0400
Subject: [PATCH 05/10] Clean up AVX2 code

---
 crypto_kem/firesaber/META.yml                 |    4 +-
 crypto_kem/firesaber/avx2/Makefile            |    2 +-
 crypto_kem/firesaber/avx2/SABER_indcpa.c      |  435 +----
 crypto_kem/firesaber/avx2/SABER_params.h      |   40 +-
 crypto_kem/firesaber/avx2/cbd.c               |   26 +-
 crypto_kem/firesaber/avx2/cbd.h               |    4 +-
 crypto_kem/firesaber/avx2/kem.c               |   12 +-
 crypto_kem/firesaber/avx2/kem.h               |   32 -
 crypto_kem/firesaber/avx2/pack_unpack.c       |  587 ++-----
 crypto_kem/firesaber/avx2/pack_unpack.h       |   46 +-
 crypto_kem/firesaber/avx2/poly.c              |   62 +
 crypto_kem/firesaber/avx2/poly.h              |   37 +-
 crypto_kem/firesaber/avx2/poly_mul.c          | 1524 +++++++++++++++++
 crypto_kem/firesaber/avx2/polymul/consts.h    |   20 -
 crypto_kem/firesaber/avx2/polymul/matrix.c    |  303 ----
 crypto_kem/firesaber/avx2/polymul/scm_avx.c   |  753 --------
 .../firesaber/avx2/polymul/toom-cook_4way.c   | 1010 -----------
 crypto_kem/firesaber/clean/SABER_indcpa.c     |  123 +-
 crypto_kem/firesaber/clean/SABER_indcpa.h     |    2 +-
 crypto_kem/firesaber/clean/SABER_params.h     |   12 +-
 crypto_kem/firesaber/clean/api.h              |    2 +-
 crypto_kem/firesaber/clean/pack_unpack.c      |  161 +-
 crypto_kem/firesaber/clean/pack_unpack.h      |   17 +-
 crypto_kem/firesaber/clean/poly.c             |   44 +-
 crypto_kem/firesaber/clean/poly.h             |   16 +-
 crypto_kem/firesaber/clean/poly_mul.c         |   18 +-
 crypto_kem/firesaber/clean/poly_mul.h         |    8 +-
 crypto_kem/lightsaber/META.yml                |    4 +-
 crypto_kem/lightsaber/avx2/Makefile           |    2 +-
 crypto_kem/lightsaber/avx2/SABER_indcpa.c     |  435 +----
 crypto_kem/lightsaber/avx2/SABER_params.h     |   43 +-
 crypto_kem/lightsaber/avx2/cbd.c              |   19 +-
 crypto_kem/lightsaber/avx2/cbd.h              |    4 +-
 crypto_kem/lightsaber/avx2/kem.c              |   12 +-
 crypto_kem/lightsaber/avx2/kem.h              |   32 -
 crypto_kem/lightsaber/avx2/pack_unpack.c      |  587 ++-----
 crypto_kem/lightsaber/avx2/pack_unpack.h      |   46 +-
 crypto_kem/lightsaber/avx2/poly.c             |   62 +
 crypto_kem/lightsaber/avx2/poly.h             |   37 +-
 crypto_kem/lightsaber/avx2/poly_mul.c         | 1524 +++++++++++++++++
 crypto_kem/lightsaber/avx2/polymul/consts.h   |   20 -
 crypto_kem/lightsaber/avx2/polymul/matrix.c   |  303 ----
 crypto_kem/lightsaber/avx2/polymul/scm_avx.c  |  753 --------
 .../lightsaber/avx2/polymul/toom-cook_4way.c  | 1010 -----------
 crypto_kem/lightsaber/clean/SABER_indcpa.c    |  123 +-
 crypto_kem/lightsaber/clean/SABER_indcpa.h    |    2 +-
 crypto_kem/lightsaber/clean/SABER_params.h    |   12 +-
 crypto_kem/lightsaber/clean/api.h             |    2 +-
 crypto_kem/lightsaber/clean/pack_unpack.c     |  169 +-
 crypto_kem/lightsaber/clean/pack_unpack.h     |   17 +-
 crypto_kem/lightsaber/clean/poly.c            |   44 +-
 crypto_kem/lightsaber/clean/poly.h            |   16 +-
 crypto_kem/lightsaber/clean/poly_mul.c        |   18 +-
 crypto_kem/lightsaber/clean/poly_mul.h        |    8 +-
 crypto_kem/saber/META.yml                     |    4 +-
 crypto_kem/saber/avx2/Makefile                |    2 +-
 crypto_kem/saber/avx2/SABER_indcpa.c          |  435 +----
 crypto_kem/saber/avx2/SABER_params.h          |   43 +-
 crypto_kem/saber/avx2/cbd.c                   |   23 +-
 crypto_kem/saber/avx2/cbd.h                   |    4 +-
 crypto_kem/saber/avx2/kem.c                   |   12 +-
 crypto_kem/saber/avx2/kem.h                   |   32 -
 crypto_kem/saber/avx2/pack_unpack.c           |  583 ++-----
 crypto_kem/saber/avx2/pack_unpack.h           |   46 +-
 crypto_kem/saber/avx2/poly.c                  |   62 +
 crypto_kem/saber/avx2/poly.h                  |   37 +-
 crypto_kem/saber/avx2/poly_mul.c              | 1524 +++++++++++++++++
 crypto_kem/saber/avx2/polymul/consts.h        |   20 -
 crypto_kem/saber/avx2/polymul/matrix.c        |  303 ----
 crypto_kem/saber/avx2/polymul/scm_avx.c       |  753 --------
 .../saber/avx2/polymul/toom-cook_4way.c       | 1010 -----------
 crypto_kem/saber/clean/SABER_indcpa.c         |  123 +-
 crypto_kem/saber/clean/SABER_indcpa.h         |    2 +-
 crypto_kem/saber/clean/SABER_params.h         |   12 +-
 crypto_kem/saber/clean/api.h                  |    2 +-
 crypto_kem/saber/clean/pack_unpack.c          |  153 +-
 crypto_kem/saber/clean/pack_unpack.h          |   17 +-
 crypto_kem/saber/clean/poly.c                 |   44 +-
 crypto_kem/saber/clean/poly.h                 |   16 +-
 crypto_kem/saber/clean/poly_mul.c             |   18 +-
 crypto_kem/saber/clean/poly_mul.h             |    8 +-
 test/duplicate_consistency/firesaber_avx2.yml |    9 +
 .../duplicate_consistency/firesaber_clean.yml |    9 +
 .../duplicate_consistency/lightsaber_avx2.yml |   27 +-
 .../lightsaber_clean.yml                      |   19 +
 test/duplicate_consistency/saber_avx2.yml     |   18 +-
 test/duplicate_consistency/saber_clean.yml    |   14 +
 87 files changed, 6314 insertions(+), 9674 deletions(-)
 create mode 100644 crypto_kem/firesaber/avx2/poly.c
 create mode 100644 crypto_kem/firesaber/avx2/poly_mul.c
 delete mode 100644 crypto_kem/firesaber/avx2/polymul/consts.h
 delete mode 100644 crypto_kem/firesaber/avx2/polymul/matrix.c
 delete mode 100644 crypto_kem/firesaber/avx2/polymul/scm_avx.c
 delete mode 100644 crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
 create mode 100644 crypto_kem/lightsaber/avx2/poly.c
 create mode 100644 crypto_kem/lightsaber/avx2/poly_mul.c
 delete mode 100644 crypto_kem/lightsaber/avx2/polymul/consts.h
 delete mode 100644 crypto_kem/lightsaber/avx2/polymul/matrix.c
 delete mode 100644 crypto_kem/lightsaber/avx2/polymul/scm_avx.c
 delete mode 100644 crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
 create mode 100644 crypto_kem/saber/avx2/poly.c
 create mode 100644 crypto_kem/saber/avx2/poly_mul.c
 delete mode 100644 crypto_kem/saber/avx2/polymul/consts.h
 delete mode 100644 crypto_kem/saber/avx2/polymul/matrix.c
 delete mode 100644 crypto_kem/saber/avx2/polymul/scm_avx.c
 delete mode 100644 crypto_kem/saber/avx2/polymul/toom-cook_4way.c

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index d1781339..0aa614ca 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile
index a44bbdb4..b7fbd7d8 100644
--- a/crypto_kem/firesaber/avx2/Makefile
+++ b/crypto_kem/firesaber/avx2/Makefile
@@ -2,7 +2,7 @@
 
 LIB=libfiresaber_avx2.a
 HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
-OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
diff --git a/crypto_kem/firesaber/avx2/SABER_indcpa.c b/crypto_kem/firesaber/avx2/SABER_indcpa.c
index 625a3f6b..285a6625 100644
--- a/crypto_kem/firesaber/avx2/SABER_indcpa.c
+++ b/crypto_kem/firesaber/avx2/SABER_indcpa.c
@@ -1,416 +1,125 @@
-#include "./polymul/toom-cook_4way.c"
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
 #include "fips202.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include "randombytes.h"
 #include <stdint.h>
-#include <stdio.h>
 #include <string.h>
-//#include "randombytes.h"
-//#include "./polymul/toom_cook_4/toom-cook_4way.c"
 
-#define h1 4 //2^(EQ-EP-1)
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly *skpv1 = A[0]; // use first row of A to hold sk temporarily
+    toom4_points skpv1_eval[SABER_L];
+    poly res[SABER_L];
 
-static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
-    int32_t i, j;
+    uint8_t rand[SABER_NOISESEEDBYTES];
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
+    randombytes(seed_A, SABER_SEEDBYTES);
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    randombytes(rand, SABER_NOISESEEDBYTES);
+    PQCLEAN_FIRESABER_AVX2_GenSecret(skpv1, rand);
+    PQCLEAN_FIRESABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key
 
-static void GenMatrix(polyvec *a, const uint8_t *seed) {
-    uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_FIRESABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
-        }
-    }
-}
-
-static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
-
-    uint32_t i;
-
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        PQCLEAN_FIRESABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
-    }
-}
-
-//********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
-    int64_t i, j;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        for (j = 0; j < NUM_POLY; j++) {
-
-            if (isTranspose == 0) {
-                toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j);
-            } else {
-                toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j);
-            }
-        }
-
-        TC_interpol(c_bucket, res_avx[i]);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]);
     }
 
-}
+    PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A); // sample matrix A
+    PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order
 
-static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
-
-    int64_t i;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i);
-    }
-    TC_interpol(c_bucket, res_avx);
-}
-
-//********************************matrix-vector mul routines*****************************************************
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
-
-    polyvec a[SABER_K];
-
-    uint16_t skpv1[SABER_K][SABER_N];
-
-
-
-    uint8_t seed[SABER_SEEDBYTES];
-    uint8_t noiseseed[SABER_COINBYTES];
-    int32_t i, j, k;
-
-
-//--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-//--------------AVX declaration ends------------------
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(noiseseed, SABER_COINBYTES);
-
-
-    GenMatrix(a, seed); //sample matrix A
-
-    GenSecret(skpv1, noiseseed);
-
-
-// Load sk into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
-        }
-
-    }
-
-    // Load a into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
+    // rounding
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1;
         }
     }
 
-
-
-    //------------------------do the matrix vector multiplication and rounding------------
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
-
-    // Now truncation
-
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-        }
-    }
-
-    //------------------Pack sk into byte string-------
-
-    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
-
-    //------------------Pack pk into byte string-------
-
-    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
-
-
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
-    }
-
+    PQCLEAN_FIRESABER_AVX2_POLVECp2BS(pk, res); // pack public key
 }
 
 
 void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly res[SABER_L];
+    toom4_points skpv1_eval[SABER_L];
 
-    uint32_t i, j, k;
-    polyvec a[SABER_K];     // skpv;
-    uint8_t seed[SABER_SEEDBYTES];
-    uint16_t pkcl[SABER_K][SABER_N];    //public key of received by the client
+    poly *temp = A[0]; // re-use stack space
+    poly *vprime = &A[0][0];
+    poly *message = &A[0][1];
 
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES;
 
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t temp[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-
-    uint8_t msk_c[SABER_SCALEBYTES_KEM];
-
-    //--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod, mod_p;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i vprime_avx[SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i pkcl_avx[SABER_K][SABER_N / 16];
-
-    __m256i message_avx[SABER_N / 16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-    mod_p = _mm256_set1_epi16(SABER_P - 1);
-
-
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-    //--------------AVX declaration ends------------------
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    PQCLEAN_FIRESABER_AVX2_GenSecret(temp, noiseseed);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_FIRESABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]);
     }
 
-    GenMatrix(a, seed);
-    GenSecret(skpv1, noiseseed);
+    PQCLEAN_FIRESABER_AVX2_GenMatrix(A, seed_A);
+    PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed
 
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+    // rounding
+    for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1;
         }
     }
-
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
-        }
-    }
-    //-----------------matrix-vector multiplication and rounding
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
-
-    // Now truncation
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-
-        }
-    }
-
-
-    //-----this result should be put in b_prime for later use in server.
-    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-
-    PQCLEAN_FIRESABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
-
-//**************client matrix-vector multiplication ends******************//
-
-    //------now calculate the v'
-
-    //-------unpack the public_key
-    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
-        }
-    }
-
-    // InnerProduct
-    //for(k=0;k<SABER_N/16;k++){
-    //  vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]);
-    //}
+    PQCLEAN_FIRESABER_AVX2_POLVECp2BS(ciphertext, res);
 
     // vector-vector scalar multiplication with mod p
+    PQCLEAN_FIRESABER_AVX2_BS2POLVECp(temp, pk);
+    PQCLEAN_FIRESABER_AVX2_InnerProd(vprime, temp, skpv1_eval);
+    PQCLEAN_FIRESABER_AVX2_BS2POLmsg(message, m);
 
-    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
-
-    // Computation of v'+h1
-    for (i = 0; i < SABER_N / 16; i++) { //adding h1
-        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
-    }
-
-    // unpack m;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((m[j] >> i) & 0x01);
-        }
-    }
-    // message encoding
-    for (i = 0; i < SABER_N / 16; i++) {
-        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
-        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
-    }
-
-    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
-    for (k = 0; k < SABER_N / 16; k++) {
-        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
-        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
-        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
-    }
-
-    // Unpack avx
-    for (j = 0; j < SABER_N / 16; j++) {
-        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
-    }
-
-    PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(msk_c, temp[0]);
-
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    for (i = 0; i < SABER_N; i++) {
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1));
+        vprime->coeffs[i] &= SABER_P - 1;
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET;
     }
 
+    PQCLEAN_FIRESABER_AVX2_POLT2BS(msk_c, vprime);
 }
 
 
 void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
 
-    uint32_t i, j;
-    uint16_t sksv[SABER_K][SABER_N]; //secret key of the server
-    uint16_t pksv[SABER_K][SABER_N];
-    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // one element containes on decrypted bit;
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t op[SABER_N];
+    poly temp[SABER_L]; // scratch vector: first the unpacked sk, then the unpacked ciphertext vector; v and cm point into it
+    toom4_points sksv_eval[SABER_L]; // output of toom4_eval for each polynomial unpacked from sk
 
-    //--------------AVX declaration------------------
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // bytes following the compressed vector; unpacked into cm below
+    poly *v = &temp[0]; // inner-product result, stored in temp[0]
+    poly *cm = &temp[1]; // unpacked cm polynomial, stored in temp[1]
 
-
-    //__m256i mod_p;
-
-    __m256i v_avx[SABER_N / 16];
-
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i sksv_avx[SABER_K][SABER_N / 16];
-    __m256i pksv_avx[SABER_K][SABER_N / 16];
-
-    //mod_p=_mm256_set1_epi16(SABER_P-1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-    //--------------AVX declaration ends------------------
-
-    //-------unpack the public_key
-
-    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
-    PQCLEAN_FIRESABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
-            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
-        }
+    PQCLEAN_FIRESABER_AVX2_BS2POLVECq(temp, sk); // unpack sk into temp (temp is overwritten again below)
+    for (i = 0; i < SABER_L; i++) { // one toom4_eval per polynomial of the secret vector
+        PQCLEAN_FIRESABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]);
     }
 
-    for (i = 0; i < SABER_N / 16; i++) {
-        v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]);
-    }
+    PQCLEAN_FIRESABER_AVX2_BS2POLVECp(temp, ciphertext); // temp now holds the vector unpacked from the start of the ciphertext
+    PQCLEAN_FIRESABER_AVX2_InnerProd(v, temp, sksv_eval); // NOTE(review): v is temp[0], which is also part of the input temp -- confirm InnerProd reads all inputs before writing its output
 
+    PQCLEAN_FIRESABER_AVX2_BS2POLT(cm, packed_cm); // overwrites temp[1]; the unpacked ciphertext vector is not used after InnerProd
 
-    // InnerProduct(b', s, mod p)
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sksv_avx[j], b_bucket[j]);
-    }
-
-    vector_vector_mul(v_avx, pksv_avx, b_bucket);
-
-    for (i = 0; i < SABER_N / 16; i++) {
-        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
-    }
-
-
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
-    }
-
-    PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(op, scale_ar);
-
-
-    //addition of h2
     for (i = 0; i < SABER_N; i++) {
-        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); // add h2, subtract cm shifted left by EP - ET bits
+        v->coeffs[i] &= SABER_P - 1; // keep the low SABER_EP bits
+        v->coeffs[i] >>= SABER_EP - 1; // only the top bit of the EP-bit value remains: 0 or 1 per coefficient
     }
 
-
-    POL2MSG(m, message_dec_unpacked);
+    PQCLEAN_FIRESABER_AVX2_POLmsg2BS(m, v); // write the per-coefficient results of v into the output buffer m
 }
diff --git a/crypto_kem/firesaber/avx2/SABER_params.h b/crypto_kem/firesaber/avx2/SABER_params.h
index e1476b6a..6481efec 100644
--- a/crypto_kem/firesaber/avx2/SABER_params.h
+++ b/crypto_kem/firesaber/avx2/SABER_params.h
@@ -1,45 +1,41 @@
 #ifndef PARAMS_H
 #define PARAMS_H
-#include "api.h"
 
 
-
-
-#define SABER_K 4
+/* Don't change anything below this line */
+#define SABER_L 4 /* number of polynomials per vector; replaces the old name SABER_K */
 #define SABER_MU 6
 #define SABER_ET 6
 
-#define SABER_EQ 13
-#define SABER_EP 10
-
 #define SABER_N 256
-#define SABER_Q 8192 //2^13
-#define SABER_P 1024
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_EP 10 /* log2 of SABER_P */
+#define SABER_P (1 << SABER_EP) /* 1024, now derived from SABER_EP instead of hard-coded */
 
-#define SABER_HASHBYTES       32
+#define SABER_EQ 13 /* log2 of SABER_Q */
+#define SABER_Q (1 << SABER_EQ) /* 8192, now derived from SABER_EQ instead of hard-coded */
 
-#define SABER_POLYBYTES              416 //13*256/8 
+#define SABER_SEEDBYTES 32 /* appended to the compressed vector in SABER_INDCPA_PUBLICKEYBYTES below */
+#define SABER_NOISESEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) /* SABER_MU bits per coefficient: 192 bytes */
 
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) /* SABER_EQ bits per coefficient: 416 bytes, formerly hard-coded */
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES) /* SABER_L such polynomials */
 
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) /* SABER_EP bits per coefficient: 320 bytes, formerly hard-coded */
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES) /* SABER_L such polynomials */
 
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8) /* SABER_ET bits per coefficient: 192 bytes */
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* compressed vector plus one ET-bit packed polynomial */
 
 #endif
diff --git a/crypto_kem/firesaber/avx2/cbd.c b/crypto_kem/firesaber/avx2/cbd.c
index 37970a81..0da0876f 100644
--- a/crypto_kem/firesaber/avx2/cbd.c
+++ b/crypto_kem/firesaber/avx2/cbd.c
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,33 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) { // fills s from buf: each group of 3 input bytes yields 4 coefficients
     uint32_t t, d, a[4], b[4];
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
-        t = load_littleendian(buf + 3 * i, 3);
+        t = (uint32_t) load_littleendian(buf + 3 * i, 3); // 3 bytes requested, so the explicit narrowing from uint64_t loses nothing
         d = 0;
         for (j = 0; j < 3; j++) {
             d += (t >> j) & 0x249249;
         }
 
-        a[0] =  d & 0x7;
-        b[0] = (d >>  3) & 0x7;
-        a[1] = (d >>  6) & 0x7;
-        b[1] = (d >>  9) & 0x7;
+        a[0] = d & 0x7; // each 3-bit field of d is the number of set bits in the matching 3 bits of t (0..3)
+        b[0] = (d >> 3) & 0x7; // fields alternate: a[k] then b[k]
+        a[1] = (d >> 6) & 0x7;
+        b[1] = (d >> 9) & 0x7;
         a[2] = (d >> 12) & 0x7;
         b[2] = (d >> 15) & 0x7;
         a[3] = (d >> 18) & 0x7;
         b[3] = (d >> 21);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
-
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]); // difference in [-3, 3]; negatives wrap mod 2^16 (no longer masked with SABER_Q - 1)
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/firesaber/avx2/cbd.h b/crypto_kem/firesaber/avx2/cbd.h
index 210bcc50..dba55d9d 100644
--- a/crypto_kem/firesaber/avx2/cbd.h
+++ b/crypto_kem/firesaber/avx2/cbd.h
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_FIRESABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
 
 
 #endif
diff --git a/crypto_kem/firesaber/avx2/kem.c b/crypto_kem/firesaber/avx2/kem.c
index 2e72e6aa..92c19a7d 100644
--- a/crypto_kem/firesaber/avx2/kem.c
+++ b/crypto_kem/firesaber/avx2/kem.c
@@ -4,14 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
-#include <immintrin.h>
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -39,7 +37,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t
     sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
     sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
 
@@ -49,7 +47,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t
 }
 
 int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
@@ -65,7 +63,7 @@ int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const ui
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk);
+    PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk);
 
     fail = PQCLEAN_FIRESABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC);
 
diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h
index a55514d9..b28b04f6 100644
--- a/crypto_kem/firesaber/avx2/kem.h
+++ b/crypto_kem/firesaber/avx2/kem.h
@@ -1,35 +1,3 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
-
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk,  uint8_t *ciphertext);
-
-void PQCLEAN_FIRESABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]);
-
-
-int PQCLEAN_FIRESABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-
-int PQCLEAN_FIRESABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
-
-int PQCLEAN_FIRESABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);
 
 
 
-//uint64_t clock1,clock2;
-
-//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex;
-
-
-#endif
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c
index 33c481b3..41b9747a 100644
--- a/crypto_kem/firesaber/avx2/pack_unpack.c
+++ b/crypto_kem/firesaber/avx2/pack_unpack.c
@@ -1,502 +1,149 @@
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
+#include <string.h>
 
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 );
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 );
-        data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 );
-        data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 );
-        data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 );
-    }
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 );
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        data[offset_data] = bytes[j] & 0x0f;
-        data[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { // packs the low 6 bits of each coefficient: 4 coefficients -> 3 bytes
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advanced by 4 per iteration
+    uint8_t *out = bytes; // write cursor, advanced by 3 per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2);
+        out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6); // in[0] bits 0-5 | in[1] bits 0-1
+        out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4); // in[1] bits 2-5 | in[2] bits 0-3
+        out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2); // in[2] bits 4-5 | in[3] bits 0-5
+        in += 4; // next group of coefficients
+        out += 3; // next group of packed bytes
     }
 }
 
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { // unpacks 6-bit coefficients: 3 bytes -> 4 coefficients
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, advanced by 3 per iteration
+    uint16_t *out = data->coeffs; // write cursor, advanced by 4 per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |  ((bytes[offset_byte + 1] & 0x0f) << 2)  ;
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ;
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
-    }
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
-        }
+        out[0] = in[0] & 0x3f; // byte 0 bits 0-5
+        out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2); // byte 0 bits 6-7 | byte 1 bits 0-3
+        out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4); // byte 1 bits 4-7 | byte 2 bits 0-1 (the & 0xff does not change a uint8_t)
+        out[3] = ((in[2] & 0xff) >> 2); // byte 2 bits 2-7
+        in += 3; // next group of packed bytes
+        out += 4; // next group of coefficients
     }
 }
 
-void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
-        }
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { // packs the low 13 bits of each coefficient: 8 coefficients -> 13 bytes
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advanced by 8 per iteration
+    uint8_t *out = bytes; // write cursor, advanced by 13 per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)); // in[0] bits 0-7
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); // in[0] bits 8-12 | in[1] bits 0-2
+        out[2] = ((in[1] >> 3) & 0xff); // in[1] bits 3-10
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); // in[1] bits 11-12 | in[2] bits 0-5
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); // in[2] bits 6-12 | in[3] bit 0
+        out[5] = ((in[3] >> 1) & 0xff); // in[3] bits 1-8
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); // in[3] bits 9-12 | in[4] bits 0-3
+        out[7] = ((in[4] >> 4) & 0xff); // in[4] bits 4-11
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); // in[4] bit 12 | in[5] bits 0-6
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); // in[5] bits 7-12 | in[6] bits 0-1
+        out[10] = ((in[6] >> 2) & 0xff); // in[6] bits 2-9
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); // in[6] bits 10-12 | in[7] bits 0-4
+        out[12] = ((in[7] >> 5) & 0xff); // in[7] bits 5-12
+        in += 8; // next group of coefficients
+        out += 13; // next group of packed bytes
     }
 }
 
-
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    //for(i=0;i<SABER_K;i++){
-    //i=0;
-    //offset_byte1=i*(SABER_N*13)/8;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { // unpack a byte string into 13-bit coefficients of data->coeffs
+    size_t j;
+    const uint8_t *in = bytes;
+    uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        //offset_byte=offset_byte1+13*j;
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); // 8 bits of in[0] + low 5 bits of in[1] = 13 bits
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); // 3 + 8 + 2 bits
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); // 6 + 7 bits
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); // 1 + 8 + 4 bits
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); // 4 + 8 + 1 bits
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); // 7 + 6 bits
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); // 2 + 8 + 3 bits
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); // 5 + 8 bits
+        in += 13; // 13 input bytes consumed per pass
+        out += 8; // 8 coefficients produced per pass
     }
-    //}
-
-
 }
 
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { // pack the low 10 bits of every coefficient of data into bytes
+    size_t j;
+    const uint16_t *in = data->coeffs;
+    uint8_t *out = bytes;
+    for (j = 0; j < SABER_N / 4; j++) { // each pass turns 4 coefficients into 5 bytes
+        out[0] = (in[0] & (0xff)); // in[0] bits 0-7
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); // in[0] bits 8-9, then in[1] bits 0-5
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); // in[1] bits 6-9, then in[2] bits 0-3
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); // in[2] bits 4-9, then in[3] bits 0-1
+        out[4] = ((in[3] >> 2) & 0xff); // in[3] bits 2-9; bits 10 and above of each coefficient are dropped
+        in += 4;
+        out += 5;
+    }
+}
 
-void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-    /*This function packs 11 bit data stream into 8 bits of data.
-    */
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { // unpack a byte string into 10-bit coefficients of data->coeffs
+    size_t j;
+    const uint8_t *in = bytes;
+    uint16_t *out = data->coeffs;
+    for (j = 0; j < SABER_N / 4; j++) { // each pass turns 5 bytes into 4 coefficients
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); // 8 bits of in[0] + low 2 bits of in[1]
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); // high 6 bits of in[1] + low 4 bits of in[2]
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); // high 4 bits of in[2] + low 6 bits of in[3]
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); // high 2 bits of in[3] + all 8 bits of in[4]
+        in += 5;
+        out += 4;
+    }
+}
 
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { // serialize SABER_L polynomials back to back
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); // polynomial i goes to byte offset i * SABER_POLYBYTES; POLq2BS is defined outside this chunk
+    }
+}
 
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3);
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { // deserialize SABER_L polynomials with 13-bit coefficients
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); // polynomial i is read from byte offset i * SABER_POLYBYTES
+    }
+}
 
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6);
+void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { // serialize SABER_L polynomials, 10 bits per coefficient
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); // polynomial i goes to byte offset i * SABER_POLYCOMPRESSEDBYTES
+    }
+}
 
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1);
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7);
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5);
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff );
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { // deserialize SABER_L polynomials with 10-bit coefficients
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); // polynomial i is read from byte offset i * SABER_POLYCOMPRESSEDBYTES
+    }
+}
 
+void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { // expand each input bit into one 0/1 coefficient
+    size_t i, j;
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j (least significant first) becomes coefficient 8*j + i
         }
     }
-
 }
 
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { // collect the low bit of each coefficient into a byte string
+    size_t i, j;
+    memset(bytes, 0, SABER_KEYBYTES); // clear first: the loop below only ORs bits in
 
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-
-            data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 );
-
-            data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 );
-
-            data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 );
-
-            data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 );
-
-            data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 );
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); // low bit of coefficient 8*j + i becomes bit i of byte j
         }
     }
-
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff );
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 );
-        }
-    }
-
-
-}
-
-void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_FIRESABER_AVX2_POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        PQCLEAN_FIRESABER_AVX2_POLVECq2BS(bytes, data);
-    }
-}
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_FIRESABER_AVX2_BS2POLVECp(data, bytes);
-    } else if (modulus == 8192) {
-        PQCLEAN_FIRESABER_AVX2_BS2POLVECq(data, bytes);
-    }
-
 }
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.h b/crypto_kem/firesaber/avx2/pack_unpack.h
index ba8a568f..eb6242be 100644
--- a/crypto_kem/firesaber/avx2/pack_unpack.h
+++ b/crypto_kem/firesaber/avx2/pack_unpack.h
@@ -1,56 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_FIRESABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_FIRESABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_FIRESABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_FIRESABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_FIRESABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_FIRESABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_FIRESABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_FIRESABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_FIRESABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_FIRESABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/firesaber/avx2/poly.c b/crypto_kem/firesaber/avx2/poly.c
new file mode 100644
index 00000000..2a7fa836
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/poly.c
@@ -0,0 +1,62 @@
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "poly.h"
+
+
+void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) { // c[i] = sum over j of A[i][j] (or A[j][i] when transpose != 0) times the polynomial behind s_eval[j]
+    size_t i, j;
+    toom4_points_product c_eval; // filled by the multiply calls, then consumed by toom4_interp
+
+    if (transpose) {
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[0][i], &s_eval[0], 0); // last argument is declared as 'accumulate'; 0 here presumably overwrites c_eval - confirm in poly_mul.c
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[j][i], &s_eval[j], 1); // column i of A: remaining terms are added onto c_eval
+            }
+            PQCLEAN_FIRESABER_AVX2_toom4_interp(&c[i], &c_eval); // one interpolation per output polynomial
+        }
+    } else {
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][0], &s_eval[0], 0); // same scheme, walking row i of A
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][j], &s_eval[j], 1);
+            }
+            PQCLEAN_FIRESABER_AVX2_toom4_interp(&c[i], &c_eval);
+        }
+    }
+}
+
+void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) { // c = sum over i of b[i] times the polynomial behind s_eval[i]
+    size_t i;
+    toom4_points_product c_eval; //Holds results for 9 Karatsuba at a time
+
+    PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[0], &s_eval[0], 0); // first term: 'accumulate' argument is 0
+    for (i = 1; i < SABER_L; i++) {
+        PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[i], &s_eval[i], 1); // remaining terms: 'accumulate' argument is 1
+    }
+
+    PQCLEAN_FIRESABER_AVX2_toom4_interp(c, &c_eval); // single interpolation after all terms are summed
+}
+
+void PQCLEAN_FIRESABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) { // derive the SABER_L x SABER_L matrix A from seed
+    size_t i;
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES]; // one SABER_POLYVECBYTES chunk per matrix row
+
+    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); // expand the seed to fill buf
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_FIRESABER_AVX2_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); // row i is unpacked from chunk i
+    }
+}
+
+void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { // derive the SABER_L secret polynomials from seed
+    size_t i;
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // one SABER_POLYCOINBYTES chunk per polynomial
+
+    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); // expand the seed to fill buf
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_FIRESABER_AVX2_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); // cbd (declared in cbd.h) turns chunk i into the coefficients of s[i]
+    }
+}
diff --git a/crypto_kem/firesaber/avx2/poly.h b/crypto_kem/firesaber/avx2/poly.h
index 8443de34..859fb95e 100644
--- a/crypto_kem/firesaber/avx2/poly.h
+++ b/crypto_kem/firesaber/avx2/poly.h
@@ -1,27 +1,38 @@
 #ifndef POLY_H
 #define POLY_H
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
 #include "SABER_params.h"
+#include <immintrin.h>
 #include <stdint.h>
 
-typedef struct {
+typedef union { // one polynomial: SABER_N 16-bit coefficients
     uint16_t coeffs[SABER_N];
+    __m256i dummy; // not referenced in the code visible here; presumably only gives the union __m256i alignment - confirm
 } poly;
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+typedef union { // argument type of toom4_eval's output and of the multiply's b_eval input
+    uint16_t coeffs[4 * SABER_N]; // four times the storage of a poly
+    __m256i dummy; // not referenced in the code visible here; presumably only gives the union __m256i alignment - confirm
+} toom4_points;
 
-void PQCLEAN_FIRESABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+typedef union { // written by toom4_mul_A_by_B_eval and read by toom4_interp
+    uint16_t coeffs[8 * SABER_N]; // eight times the storage of a poly
+    __m256i dummy; // not referenced in the code visible here; presumably only gives the union __m256i alignment - confirm
+} toom4_points_product;
+
+void PQCLEAN_FIRESABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose);
+
+void PQCLEAN_FIRESABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]);
+
+void PQCLEAN_FIRESABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_FIRESABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
 
 
-void PQCLEAN_FIRESABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+void PQCLEAN_FIRESABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval);
+
+void PQCLEAN_FIRESABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b);
+
+void PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/firesaber/avx2/poly_mul.c b/crypto_kem/firesaber/avx2/poly_mul.c
new file mode 100644
index 00000000..d4e37d59
--- /dev/null
+++ b/crypto_kem/firesaber/avx2/poly_mul.c
@@ -0,0 +1,1524 @@
+#include "SABER_params.h"
+#include "poly.h"
+
+
+#define L (SABER_N / 64)
+
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { // per 16-bit lane: a * b + c, keeping the low 16 bits
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
+}
+
+static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) { // c[k] += sum of a[i]*b[j] over i + j = k, for a[0..15], b[0..15]; fully unrolled; c[31] is set to zero
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+    __m256i temp;
+
+    a0 = a[0]; // locals first hold the low halves a[0..7] and b[0..7]
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+
+    c[0] = mul_add(a0, b0, c[0]); // k = 0: single term
+
+    temp = _mm256_mullo_epi16(a0, b1); // k = 1: terms are summed in temp, then added to the existing c[k]
+    temp = mul_add(a1, b0, temp);
+    c[1] = _mm256_add_epi16(temp, c[1]);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    temp = mul_add(a2, b0, temp);
+    c[2] = _mm256_add_epi16(temp, c[2]);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    temp = mul_add(a3, b0, temp);
+    c[3] = _mm256_add_epi16(temp, c[3]);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    c[4] = _mm256_add_epi16(temp, c[4]);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    temp = mul_add(a5, b0, temp);
+    c[5] = _mm256_add_epi16(temp, c[5]);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a4, b2, temp);
+    c[6] = _mm256_add_epi16(temp, c[6]);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a5, b2, temp);
+    c[7] = _mm256_add_epi16(temp, c[7]);
+
+    temp = _mm256_mullo_epi16(a0, b[8]); // from k = 8 on, the high halves a[8..], b[8..] are read straight from memory
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a6, b2, temp);
+    c[8] = _mm256_add_epi16(temp, c[8]);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a7, b2, temp);
+    c[9] = _mm256_add_epi16(temp, c[9]);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a[8], b2, temp);
+    c[10] = _mm256_add_epi16(temp, c[10]);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    temp = mul_add(a[9], b2, temp);
+    c[11] = _mm256_add_epi16(temp, c[11]);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    temp = mul_add(a[10], b2, temp);
+    c[12] = _mm256_add_epi16(temp, c[12]);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    temp = mul_add(a[11], b2, temp);
+    c[13] = _mm256_add_epi16(temp, c[13]);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    temp = mul_add(a[12], b2, temp);
+    c[14] = _mm256_add_epi16(temp, c[14]);
+
+    temp = _mm256_mullo_epi16(a0, b[15]); // k = 15: the longest sum, 16 terms
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    temp = mul_add(a[13], b2, temp);
+    c[15] = _mm256_add_epi16(temp, c[15]);
+
+    a0 = a[14]; // locals are rebound to the high halves; note the order: a0 = a[14], a1 = a[15], a2..a7 = a[13]..a[8]
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14]; // same mapping for b: b0 = b[14], b1 = b[15], b2..b7 = b[13]..b[8]
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1); // k = 16: a[i]*b[16-i] for i = 1..15; the low halves are now read from memory
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    temp = mul_add(a1, b[1], temp);
+    c[16] = _mm256_add_epi16(temp, c[16]);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    temp = mul_add(a1, b[2], temp);
+    c[17] = _mm256_add_epi16(temp, c[17]);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    temp = mul_add(a1, b[3], temp);
+    c[18] = _mm256_add_epi16(temp, c[18]);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    temp = mul_add(a1, b[4], temp);
+    c[19] = _mm256_add_epi16(temp, c[19]);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    temp = mul_add(a1, b[5], temp);
+    c[20] = _mm256_add_epi16(temp, c[20]);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    temp = mul_add(a1, b[6], temp);
+    c[21] = _mm256_add_epi16(temp, c[21]);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    temp = mul_add(a1, b[7], temp);
+    c[22] = _mm256_add_epi16(temp, c[22]);
+
+    temp = _mm256_mullo_epi16(a7, b1); // from k = 23 on, only the high halves contribute
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    temp = mul_add(a1, b7, temp);
+    c[23] = _mm256_add_epi16(temp, c[23]);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    temp = mul_add(a1, b6, temp);
+    c[24] = _mm256_add_epi16(temp, c[24]);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    temp = mul_add(a1, b5, temp);
+    c[25] = _mm256_add_epi16(temp, c[25]);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    temp = mul_add(a1, b4, temp);
+    c[26] = _mm256_add_epi16(temp, c[26]);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    temp = mul_add(a1, b3, temp);
+    c[27] = _mm256_add_epi16(temp, c[27]);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    temp = mul_add(a1, b2, temp);
+    c[28] = _mm256_add_epi16(temp, c[28]);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    temp = mul_add(a1, b0, temp);
+    c[29] = _mm256_add_epi16(temp, c[29]);
+
+    c[30] = mul_add(a1, b1, c[30]); // k = 30: a[15]*b[15]
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // overwritten with zero rather than accumulated: no pair of indices sums to 31
+}
+
+
+static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+    __m256i temp;
+    // Unrolled 16 x 16 schoolbook product: c[0..30] get the coefficient sums of a[0..15] * b[0..15] and c[31] is cleared.
+    a0 = a[0]; // terms 0..7 of a and b are kept in locals; terms 8..15 are indexed directly
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+    // NOTE(review): mul_add is defined outside this function; presumably mul_add(x, y, acc) is acc + x * y per 16-bit lane -- confirm.
+    c[0] = _mm256_mullo_epi16(a0, b0); // first triangle: c[k] sums a[i] * b[k - i] for i = 0..k, k = 0..15
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[1] = mul_add(a1, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    c[2] = mul_add(a2, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    c[3] = mul_add(a3, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    c[4] = mul_add(a2, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    c[5] = mul_add(a5, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    c[6] = mul_add(a4, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    c[7] = mul_add(a5, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[8]);
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    c[8] = mul_add(a6, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    c[9] = mul_add(a7, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    c[10] = mul_add(a[8], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    c[11] = mul_add(a[9], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    c[12] = mul_add(a[10], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    c[13] = mul_add(a[11], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    c[14] = mul_add(a[12], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[15]);
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    c[15] = mul_add(a[13], b2, temp);
+
+    // Second triangle, c[16..30]: the locals are rebound to terms 8..15 and the low terms a[1..7] / b[1..7] are indexed directly.
+    a0 = a[14]; // mapping is not monotone: a0, a1 hold terms 14, 15 while a2..a7 hold terms 13 down to 8
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14]; // same index mapping as for a
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1);
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    c[16] = mul_add(a1, b[1], temp);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    c[17] = mul_add(a1, b[2], temp);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    c[18] = mul_add(a1, b[3], temp);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    c[19] = mul_add(a1, b[4], temp);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    c[20] = mul_add(a1, b[5], temp);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    c[21] = mul_add(a1, b[6], temp);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    c[22] = mul_add(a1, b[7], temp);
+
+    temp = _mm256_mullo_epi16(a7, b1);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    c[23] = mul_add(a1, b7, temp);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    c[24] = mul_add(a1, b6, temp);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    c[25] = mul_add(a1, b5, temp);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    c[26] = mul_add(a1, b4, temp);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    c[27] = mul_add(a1, b3, temp);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    c[28] = mul_add(a1, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[29] = mul_add(a1, b0, temp);
+
+    c[30] = _mm256_mullo_epi16(a1, b1);
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // the highest product index is 15 + 15 = 30, so slot 31 is simply cleared
+}
+
+static void transpose(__m256i *M) {
+    __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+    __m256i temp, temp0, temp1, temp2;
+    // In-place transpose of M[0..15] viewed as a 16 x 16 matrix of 16-bit words (one row per vector). Statement order matters: rows are read just before they are overwritten.
+    r0 = _mm256_unpacklo_epi16(M[0], M[1]); // pair adjacent rows word by word; unpacklo covers columns 0-3 (low half) and 8-11 (high half)
+    r1 = _mm256_unpacklo_epi16(M[2], M[3]);
+    r2 = _mm256_unpacklo_epi16(M[4], M[5]);
+    r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+    r4 = _mm256_unpacklo_epi16(M[8], M[9]);
+    r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+    r6 = _mm256_unpacklo_epi16(M[12], M[13]);
+    r7 = _mm256_unpacklo_epi16(M[14], M[15]);
+
+    temp = _mm256_unpacklo_epi32(r0, r1); // temp..temp2: columns 0-1 / 8-9 for rows 0-3, 4-7, 8-11, 12-15
+    temp0 = _mm256_unpacklo_epi32(r2, r3);
+    temp1 = _mm256_unpacklo_epi32(r4, r5);
+    temp2 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpackhi_epi32(r0, r1); // r8..r11: columns 2-3 / 10-11 for the same row groups, consumed further down
+    r9 = _mm256_unpackhi_epi32(r2, r3);
+    r10 = _mm256_unpackhi_epi32(r4, r5);
+    r11 = _mm256_unpackhi_epi32(r6, r7);
+
+    r0 = _mm256_unpacklo_epi64(temp, temp0); // r0, r1: column 0 / 8 for rows 0-7 and rows 8-15; r2, r3: column 1 / 9
+    r2 = _mm256_unpackhi_epi64(temp, temp0);
+    r1 = _mm256_unpacklo_epi64(temp1, temp2);
+    r3 = _mm256_unpackhi_epi64(temp1, temp2);
+
+    temp = _mm256_unpackhi_epi16(M[0], M[1]); // unpackhi covers columns 4-7 / 12-15; rows 0-7 are read here before M[0], M[1] are overwritten
+    temp0 = _mm256_unpackhi_epi16(M[2], M[3]);
+    temp1 = _mm256_unpackhi_epi16(M[4], M[5]);
+    temp2 = _mm256_unpackhi_epi16(M[6], M[7]);
+
+    r4 = _mm256_unpackhi_epi16(M[8], M[9]); // must read M[8], M[9] before they are overwritten just below
+    M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); // 0x20 joins the two low 128-bit halves (output rows 0-7), 0x31 the two high halves (output rows 8-15)
+    M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
+    M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    r5 = _mm256_unpackhi_epi16(M[10], M[11]); // M[10..15] have not been overwritten yet, so these still read input rows
+    r6 = _mm256_unpackhi_epi16(M[12], M[13]);
+    r7 = _mm256_unpackhi_epi16(M[14], M[15]);
+
+    r0 = _mm256_unpacklo_epi64(r8, r9); // columns 2 / 10 (r0, r1) and 3 / 11 (r2, r3)
+    r1 = _mm256_unpacklo_epi64(r10, r11);
+    r2 = _mm256_unpackhi_epi64(r8, r9);
+    r3 = _mm256_unpackhi_epi64(r10, r11);
+
+    M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+    M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+    r0 = _mm256_unpacklo_epi32(temp, temp0); // columns 4-5 / 12-13 for the four row groups
+    r1 = _mm256_unpacklo_epi32(temp1, temp2);
+    r2 = _mm256_unpacklo_epi32(r4, r5);
+    r3 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpacklo_epi64(r0, r1);
+    r10 = _mm256_unpackhi_epi64(r0, r1);
+    r9 = _mm256_unpacklo_epi64(r2, r3);
+    r11 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+    M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+    M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+    M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+
+    r0 = _mm256_unpackhi_epi32(temp, temp0); // columns 6-7 / 14-15 for the four row groups
+    r1 = _mm256_unpackhi_epi32(temp1, temp2);
+    r2 = _mm256_unpackhi_epi32(r4, r5);
+    r3 = _mm256_unpackhi_epi32(r6, r7);
+
+    r4 = _mm256_unpacklo_epi64(r0, r1);
+    r6 = _mm256_unpackhi_epi64(r0, r1);
+    r5 = _mm256_unpacklo_epi64(r2, r3);
+    r7 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+    M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+    M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+    M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) {
+    toom4_points a_eval;// evaluation vectors derived from a for the 7 groups handled per call (b arrives already evaluated in b_eval)
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+    __m256i *va = (__m256i *)a_eval.coeffs;
+    __m256i *vb = (__m256i *)b_eval->coeffs; // NOTE(review): this cast drops the const qualifier of b_eval
+    __m256i *vc = (__m256i *)c_eval->coeffs;
+    // Expands 7 four-vector groups of a into 9 evaluation vectors each, transposes them in 16-vector tiles, and multiplies tile by tile against b_eval into c_eval.
+    //------------------AVX evaluation for 1st poly-----------------------
+    r0_avx = a[0 * L + 0]; // L is defined elsewhere in the file; group i starts at a[i * L]
+    r1_avx = a[0 * L + 1];
+    r2_avx = a[0 * L + 2];
+    r3_avx = a[0 * L + 3];
+
+    va[0] = r0_avx; // 9 vectors per group: r0, r1, r2, r3, r0+r1, r2+r3, r0+r2, r1+r3, r0+r1+r2+r3
+    va[1] = r1_avx;
+    va[2] = r2_avx;
+    va[3] = r3_avx;
+    va[4] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8] = _mm256_add_epi16(va[6], va[7]);
+    //------------------AVX evaluation for 1st poly ends------------------
+
+    //------------------AVX evaluation for 2nd poly-----------------------
+    r0_avx = a[1 * L + 0]; // groups 2-7 repeat the same pattern at va offsets 9, 18, ..., 54
+    r1_avx = a[1 * L + 1];
+    r2_avx = a[1 * L + 2];
+    r3_avx = a[1 * L + 3];
+
+    va[0 + 9] = r0_avx;
+    va[1 + 9] = r1_avx;
+    va[2 + 9] = r2_avx;
+    va[3 + 9] = r3_avx;
+    va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]);
+    //------------------AVX evaluation for 2nd poly ends------------------
+
+    //------------------AVX evaluation for 3rd poly-----------------------
+    r0_avx = a[2 * L + 0];
+    r1_avx = a[2 * L + 1];
+    r2_avx = a[2 * L + 2];
+    r3_avx = a[2 * L + 3];
+
+    va[0 + 18] = r0_avx;
+    va[1 + 18] = r1_avx;
+    va[2 + 18] = r2_avx;
+    va[3 + 18] = r3_avx;
+    va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]);
+    //------------------AVX evaluation for 3rd poly ends------------------
+
+    //------------------AVX evaluation for 4th poly-----------------------
+    r0_avx = a[3 * L + 0];
+    r1_avx = a[3 * L + 1];
+    r2_avx = a[3 * L + 2];
+    r3_avx = a[3 * L + 3];
+
+    va[0 + 27] = r0_avx;
+    va[1 + 27] = r1_avx;
+    va[2 + 27] = r2_avx;
+    va[3 + 27] = r3_avx;
+    va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]);
+    //------------------AVX evaluation for 4th poly ends------------------
+
+    //------------------AVX evaluation for 5th poly-----------------------
+    r0_avx = a[4 * L + 0];
+    r1_avx = a[4 * L + 1];
+    r2_avx = a[4 * L + 2];
+    r3_avx = a[4 * L + 3];
+
+    va[0 + 36] = r0_avx;
+    va[1 + 36] = r1_avx;
+    va[2 + 36] = r2_avx;
+    va[3 + 36] = r3_avx;
+    va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]);
+    //------------------AVX evaluation for 5th poly ends------------------
+
+    //------------------AVX evaluation for 6th poly-----------------------
+    r0_avx = a[5 * L + 0];
+    r1_avx = a[5 * L + 1];
+    r2_avx = a[5 * L + 2];
+    r3_avx = a[5 * L + 3];
+
+    va[0 + 45] = r0_avx;
+    va[1 + 45] = r1_avx;
+    va[2 + 45] = r2_avx;
+    va[3 + 45] = r3_avx;
+    va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]);
+    //------------------AVX evaluation for 6th poly ends------------------
+
+    //------------------AVX evaluation for 7th poly-----------------------
+    r0_avx = a[6 * L + 0];
+    r1_avx = a[6 * L + 1];
+    r2_avx = a[6 * L + 2];
+    r3_avx = a[6 * L + 3];
+
+    va[0 + 54] = r0_avx;
+    va[1 + 54] = r1_avx;
+    va[2 + 54] = r2_avx;
+    va[3 + 54] = r3_avx;
+    va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]); // va[62] is the last vector written by the evaluation
+    //------------------AVX evaluation for 7th poly ends------------------
+
+    //-----------------Forward transposes--------------------------------------
+    transpose(va); // each call transposes one 16-vector tile of va in place
+    transpose(va + 16);
+    transpose(va + 32);
+    transpose(va + 48); // NOTE(review): this tile spans va[48..63], but va[63] is never written above -- confirm it is padding whose lane is unused
+    //-----------------Forward transposes ends---------------------------------
+
+    if (accumulate == 0) { // zero: products overwrite c_eval; nonzero: schoolbook_avx_acc adds them to the values already in c_eval
+        schoolbook_avx(vc, va, vb); // one product per tile: 16 vectors of va and vb in, 32 vectors of vc out
+        schoolbook_avx(vc + 32, va + 16, vb + 16);
+        schoolbook_avx(vc + 64, va + 32, vb + 32);
+        schoolbook_avx(vc + 96, va + 48, vb + 48);
+    } else {
+        schoolbook_avx_acc(vc, va, vb);
+        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
+        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
+        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+    }
+}
+
+static void karatsuba_eval(__m256i *b_eval, const __m256i *b) {
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+    // Expands 7 four-vector groups of b into 9 evaluation vectors each (b_eval[0..62]), then transposes b_eval in four 16-vector tiles.
+    //-------1st poly----------------------------------------------------
+    r0_avx = b[0 * L + 0]; // L is defined elsewhere in the file; group i starts at b[i * L]
+    r1_avx = b[0 * L + 1];
+    r2_avx = b[0 * L + 2];
+    r3_avx = b[0 * L + 3];
+
+    b_eval[0] = r0_avx; // 9 vectors per group: r0, r1, r2, r3, r0+r1, r2+r3, r0+r2, r1+r3, r0+r1+r2+r3
+    b_eval[1] = r1_avx;
+    b_eval[2] = r2_avx;
+    b_eval[3] = r3_avx;
+    b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]);
+
+    //-------2nd poly----------------------------------------------------
+    r0_avx = b[1 * L + 0]; // groups 2-7 repeat the same pattern at b_eval offsets 9, 18, ..., 54
+    r1_avx = b[1 * L + 1];
+    r2_avx = b[1 * L + 2];
+    r3_avx = b[1 * L + 3];
+
+    b_eval[0 + 9] = r0_avx;
+    b_eval[1 + 9] = r1_avx;
+    b_eval[2 + 9] = r2_avx;
+    b_eval[3 + 9] = r3_avx;
+    b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]);
+
+    //-------3rd poly----------------------------------------------------
+    r0_avx = b[2 * L + 0];
+    r1_avx = b[2 * L + 1];
+    r2_avx = b[2 * L + 2];
+    r3_avx = b[2 * L + 3];
+
+    b_eval[0 + 18] = r0_avx;
+    b_eval[1 + 18] = r1_avx;
+    b_eval[2 + 18] = r2_avx;
+    b_eval[3 + 18] = r3_avx;
+    b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]);
+
+    //-------4th poly----------------------------------------------------
+    r0_avx = b[3 * L + 0];
+    r1_avx = b[3 * L + 1];
+    r2_avx = b[3 * L + 2];
+    r3_avx = b[3 * L + 3];
+
+    b_eval[0 + 27] = r0_avx;
+    b_eval[1 + 27] = r1_avx;
+    b_eval[2 + 27] = r2_avx;
+    b_eval[3 + 27] = r3_avx;
+    b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]);
+
+    //-------5th poly----------------------------------------------------
+    r0_avx = b[4 * L + 0];
+    r1_avx = b[4 * L + 1];
+    r2_avx = b[4 * L + 2];
+    r3_avx = b[4 * L + 3];
+
+    b_eval[0 + 36] = r0_avx;
+    b_eval[1 + 36] = r1_avx;
+    b_eval[2 + 36] = r2_avx;
+    b_eval[3 + 36] = r3_avx;
+    b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]);
+
+    //-------6th poly----------------------------------------------------
+    r0_avx = b[5 * L + 0];
+    r1_avx = b[5 * L + 1];
+    r2_avx = b[5 * L + 2];
+    r3_avx = b[5 * L + 3];
+
+    b_eval[0 + 45] = r0_avx;
+    b_eval[1 + 45] = r1_avx;
+    b_eval[2 + 45] = r2_avx;
+    b_eval[3 + 45] = r3_avx;
+    b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]);
+
+    //-------7th poly----------------------------------------------------
+    r0_avx = b[6 * L + 0];
+    r1_avx = b[6 * L + 1];
+    r2_avx = b[6 * L + 2];
+    r3_avx = b[6 * L + 3];
+
+    b_eval[0 + 54] = r0_avx;
+    b_eval[1 + 54] = r1_avx;
+    b_eval[2 + 54] = r2_avx;
+    b_eval[3 + 54] = r3_avx;
+    b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]); // b_eval[62] is the last vector written by the evaluation
+
+    //--------------Evaluating B poly ends-------------------------------
+    transpose(b_eval); // each call transposes one 16-vector tile in place
+    transpose(b_eval + 16);
+    transpose(b_eval + 32);
+    transpose(b_eval + 48); // NOTE(review): reads and permutes b_eval[63], which this function never writes -- the caller must provide room for 64 vectors; confirm
+}
+
+static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) {
+    __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
+    __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
+
+    //------------------------AVX interpolation for 1st poly external-------------------
+    res_avx0 = c_eval[0];
+    res_avx2 = c_eval[1];
+    res_avx4 = c_eval[2];
+    res_avx6 = c_eval[3];
+    c6_avx = c_eval[6];
+    c7_avx = c_eval[7];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[16];
+    res_avx3 = c_eval[17];
+    res_avx5 = c_eval[18];
+    res_avx7 = c_eval[19];
+    c22_avx = c_eval[22];
+    c23_avx = c_eval[23];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final0[0] = res_avx0;
+    result_final0[1] = res_avx1;
+    result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final0[6] = res_avx6;
+    result_final0[7] = res_avx7;
+    //------------------------AVX interpolation for 1st poly ends--------------
+
+
+    //------------------------AVX interpolation for 2nd poly external-------------------
+    res_avx0 = c_eval[9]; //c_eval0
+    res_avx2 = c_eval[10]; //c_eval1
+    res_avx4 = c_eval[11]; //c_eval2
+    res_avx6 = c_eval[12]; //c_eval3
+    c6_avx = c_eval[15]; //c_eval6
+    c7_avx = c_eval[32]; //c_eval7
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[25]; //c_eval0
+    res_avx3 = c_eval[26]; //c_eval1
+    res_avx5 = c_eval[27]; //c_eval2
+    res_avx7 = c_eval[28]; //c_eval3
+    c22_avx = c_eval[31];
+    c23_avx = c_eval[48];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final1[0] = res_avx0;
+    result_final1[1] = res_avx1;
+    result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final1[6] = res_avx6;
+    result_final1[7] = res_avx7;
+    //------------------------AVX interpolation for 2nd poly ends--------------
+
+    //------------------------AVX interpolation for 3rd poly external-------------------
+    res_avx0 = c_eval[34]; //c_eval0
+    res_avx2 = c_eval[35]; //c_eval1
+    res_avx4 = c_eval[36];
+    res_avx6 = c_eval[37];
+    c6_avx = c_eval[40];
+    c7_avx = c_eval[41];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[50]; //c_eval0
+    res_avx3 = c_eval[51]; //c_eval1
+    res_avx5 = c_eval[52];
+    res_avx7 = c_eval[53];
+    c22_avx = c_eval[56];
+    c23_avx = c_eval[57];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final2[0] = res_avx0;
+    result_final2[1] = res_avx1;
+    result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final2[6] = res_avx6;
+    result_final2[7] = res_avx7;
+    //------------------------AVX interpolation for 3rd poly ends--------------
+
+    //------------------------AVX interpolation for 4th poly external-------------------
+    res_avx0 = c_eval[43];
+    res_avx2 = c_eval[44];
+    res_avx4 = c_eval[45];
+    res_avx6 = c_eval[46];
+    c6_avx = c_eval[65];
+    c7_avx = c_eval[66];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[59];
+    res_avx3 = c_eval[60];
+    res_avx5 = c_eval[61];
+    res_avx7 = c_eval[62];
+    c22_avx = c_eval[81];
+    c23_avx = c_eval[82];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final3[0] = res_avx0;
+    result_final3[1] = res_avx1;
+    result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final3[6] = res_avx6;
+    result_final3[7] = res_avx7;
+    //------------------------AVX interpolation for 4th poly ends--------------
+
+    //------------------------AVX interpolation for 5th poly external-------------------
+    res_avx0 = c_eval[68];
+    res_avx2 = c_eval[69];
+    res_avx4 = c_eval[70];
+    res_avx6 = c_eval[71];
+    c6_avx = c_eval[74];
+    c7_avx = c_eval[75];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[84];
+    res_avx3 = c_eval[85];
+    res_avx5 = c_eval[86];
+    res_avx7 = c_eval[87];
+    c22_avx = c_eval[90];
+    c23_avx = c_eval[91];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final4[0] = res_avx0;
+    result_final4[1] = res_avx1;
+    result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final4[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final4[6] = res_avx6;
+    result_final4[7] = res_avx7;
+    //------------------------AVX interpolation for 5th poly ends--------------
+
+    //------------------------AVX interpolation for 6th poly external-------------------
+    res_avx0 = c_eval[77];
+    res_avx2 = c_eval[78];
+    res_avx4 = c_eval[79];
+    res_avx6 = c_eval[96];
+    c6_avx = c_eval[99];
+    c7_avx = c_eval[100];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[93];
+    res_avx3 = c_eval[94];
+    res_avx5 = c_eval[95];
+    res_avx7 = c_eval[112];
+    c22_avx = c_eval[115];
+    c23_avx = c_eval[116];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final5[0] = res_avx0;
+    result_final5[1] = res_avx1;
+    result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final5[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final5[6] = res_avx6;
+    result_final5[7] = res_avx7;
+    //------------------------AVX interpolation for 6th poly ends--------------
+
+    //------------------------AVX interpolation for 7th poly external-------------------
+    res_avx0 = c_eval[102];
+    res_avx2 = c_eval[103];
+    res_avx4 = c_eval[104];
+    res_avx6 = c_eval[105];
+    c6_avx = c_eval[108];
+    c7_avx = c_eval[109];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[118];
+    res_avx3 = c_eval[119];
+    res_avx5 = c_eval[120];
+    res_avx7 = c_eval[121];
+    c22_avx = c_eval[124];
+    c23_avx = c_eval[125];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final6[0] = res_avx0;
+    result_final6[1] = res_avx1;
+    result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final6[6] = res_avx6;
+    result_final6[7] = res_avx7;
+    //------------------------AVX interpolation for 7th poly ends--------------
+}
+
+void PQCLEAN_FIRESABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) { // evaluate a at seven points, then hand a's evaluations and b_eval to the batched multiplier
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i aw_avx[7 * L]; // seven evaluations of a, L vectors each; evaluation k lives in aw_avx[k * L .. k * L + L - 1]
+    __m256i *va = (__m256i *)a->coeffs; // NOTE(review): the cast drops const; va is only read in this function
+    // a is read as four limbs of L vectors each, i.e. a(y) = r0 + r1*y + r2*y^2 + r3*y^3 with r0 the lowest-indexed limb.
+    for (i = 0; i < L; i++) {
+        r0_avx = va[0 * L + i];
+        r1_avx = va[1 * L + i];
+        r2_avx = va[2 * L + i];
+        r3_avx = va[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx); // r0 + r2
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx); // r1 + r3
+        aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3 = a(1)
+        aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3 = a(-1)
+        r4_avx = _mm256_slli_epi16(r0_avx, 2); // 4*r0
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx); // 4*r0 + r2
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // 8*r0 + 2*r2
+        r5_avx = _mm256_slli_epi16(r1_avx, 2); // 4*r1
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // 4*r1 + r3
+        aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3 = 8 * a(1/2)
+        aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3 = 8 * a(-1/2)
+        r4_avx = _mm256_slli_epi16(r3_avx, 3); // 8*r3
+        r6_avx = _mm256_slli_epi16(r2_avx, 2); // 4*r2
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx); // 8*r3 + 4*r2
+        r6_avx = _mm256_slli_epi16(r1_avx, 1); // 2*r1
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx); // 8*r3 + 4*r2 + 2*r1
+        aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3 = a(2)
+        aw_avx[6 * L + i] = r0_avx; // lowest limb = a(0)
+        aw_avx[0 * L + i] = r3_avx; // highest limb (leading coefficient, the "point at infinity")
+    }
+    // All lane arithmetic above is 16-bit and wraps modulo 2^16.
+    batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate); // accumulate is forwarded unchanged; presumably selects add-into vs. overwrite of c_eval -- confirm in the callee
+}
+
+void PQCLEAN_FIRESABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) { // evaluate b at seven points and pass the result through karatsuba_eval into b_eval
+    size_t i;
+    __m256i bw_avx[7 * L]; // seven evaluations of b, L vectors each; evaluation k lives in bw_avx[k * L .. k * L + L - 1]
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i *vb = (__m256i *)b->coeffs; // NOTE(review): the cast drops const; vb is only read in this function
+    __m256i *vb_eval = (__m256i *)b_eval->coeffs; // output buffer handed to karatsuba_eval
+    // b is read as four limbs of L vectors each, i.e. b(y) = r0 + r1*y + r2*y^2 + r3*y^3 with r0 the lowest-indexed limb.
+    for (i = 0; i < L; i++) {
+        r0_avx = vb[0 * L + i];
+        r1_avx = vb[1 * L + i];
+        r2_avx = vb[2 * L + i];
+        r3_avx = vb[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx); // r0 + r2
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx); // r1 + r3
+        bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // r0 + r1 + r2 + r3 = b(1)
+        bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // r0 - r1 + r2 - r3 = b(-1)
+        r4_avx = _mm256_slli_epi16(r0_avx, 2); // 4*r0
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx); // 4*r0 + r2
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // 8*r0 + 2*r2
+        r5_avx = _mm256_slli_epi16(r1_avx, 2); // 4*r1
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // 4*r1 + r3
+        bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8*r0 + 4*r1 + 2*r2 + r3 = 8 * b(1/2)
+        bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8*r0 - 4*r1 + 2*r2 - r3 = 8 * b(-1/2)
+        r4_avx = _mm256_slli_epi16(r3_avx, 3); // 8*r3
+        r6_avx = _mm256_slli_epi16(r2_avx, 2); // 4*r2
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx); // 8*r3 + 4*r2
+        r6_avx = _mm256_slli_epi16(r1_avx, 1); // 2*r1
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx); // 8*r3 + 4*r2 + 2*r1
+        bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // r0 + 2*r1 + 4*r2 + 8*r3 = b(2)
+        bw_avx[6 * L + i] = r0_avx; // lowest limb = b(0)
+        bw_avx[0 * L + i] = r3_avx; // highest limb (leading coefficient, the "point at infinity")
+    }
+    // All lane arithmetic above is 16-bit and wraps modulo 2^16.
+    karatsuba_eval(vb_eval, bw_avx); // defined outside this view; presumably expands the evaluations into the operand layout the batched multiplier consumes -- confirm
+}
+
+
+void PQCLEAN_FIRESABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) { // rebuild the product polynomial from the pointwise products in c_eval and reduce it into res
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+    __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; // seven arrays of 2 * L vectors, filled by karatsuba_interp
+    __m256i res_full[32]; // unreduced product; NOTE(review): indices used below run 0 .. 8 * L - 1, so 32 is right only if L == 4 -- confirm
+    __m256i *vc = (__m256i *)c_eval->coeffs; // NOTE(review): the cast drops const from c_eval
+    __m256i *vres = (__m256i *)res->coeffs;
+    // NOTE(review): transpose() is defined outside this view; if it works in place, these calls modify the caller's c_eval although it is declared const -- confirm.
+    transpose(vc);
+    transpose(vc + 16);
+    transpose(vc + 32);
+    transpose(vc + 48);
+    transpose(vc + 64);
+    transpose(vc + 80);
+    transpose(vc + 96);
+    transpose(vc + 112);
+    // Eight blocks at a stride of 16 vectors (vc + 0 .. vc + 112) have been passed to transpose().
+    karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc);
+    // For each vector index i, combine the seven values lane-wise (16-bit, modulo 2^16) into seven coefficient limbs r0 .. r6.
+    for (i = 0; i < 2 * L; i++) {
+        r0_avx = w1_avx[i];
+        r1_avx = w2_avx[i];
+        r2_avx = w3_avx[i];
+        r3_avx = w4_avx[i];
+        r4_avx = w5_avx[i];
+        r5_avx = w6_avx[i];
+        r6_avx = w7_avx[i];
+        // NOTE(review): the right shifts below are logical, so high bits of each lane are discarded; presumably only the low bits of the result are needed -- confirm against the modulus.
+        r1_avx = _mm256_add_epi16(r1_avx, r4_avx); // r1 += r4
+        r5_avx = _mm256_sub_epi16(r5_avx, r4_avx); // r5 -= r4
+        r3_avx = _mm256_sub_epi16(r3_avx, r2_avx); // r3 -= r2
+        r3_avx = _mm256_srli_epi16(r3_avx, 1); // r3 >>= 1
+        r4_avx = _mm256_sub_epi16(r4_avx, r0_avx); // r4 -= r0
+        temp_avx = _mm256_slli_epi16(r6_avx, 6); // 64*r6
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); // r4 -= 64*r6
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // r4 *= 2
+        r4_avx = _mm256_add_epi16(r4_avx, r5_avx); // r4 += r5
+        r2_avx = _mm256_add_epi16(r2_avx, r3_avx); // r2 += r3
+        temp_avx = _mm256_slli_epi16(r2_avx, 6); // 64*r2
+
+        r1_avx = _mm256_sub_epi16(r1_avx, temp_avx); // r1 -= 64*r2
+        r1_avx = _mm256_sub_epi16(r1_avx, r2_avx); // r1 -= r2 (65*r2 subtracted in total)
+        r2_avx = _mm256_sub_epi16(r2_avx, r6_avx); // r2 -= r6
+        r2_avx = _mm256_sub_epi16(r2_avx, r0_avx); // r2 -= r0
+        temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45)); // 45*r2
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx); // r1 += 45*r2
+        temp_avx = _mm256_slli_epi16(r2_avx, 3); // 8*r2
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx); // r4 -= 8*r2
+        r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 0xAAAB, the inverse of 3 modulo 2^16 (3 * 0xAAAB = 0x20001)
+        r4_avx = _mm256_srli_epi16(r4_avx, 3); // r4 >>= 3; with the multiply above this divides by 24
+        r5_avx = _mm256_add_epi16(r5_avx, r1_avx); // r5 += r1
+        temp_avx = _mm256_slli_epi16(r3_avx, 4); // 16*r3
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx); // r1 += 16*r3
+        r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 0x8E39, the inverse of 9 modulo 2^16 (9 * 0x8E39 = 0x50001)
+        r1_avx = _mm256_srli_epi16(r1_avx, 1); // r1 >>= 1; with the multiply above this divides by 18
+        r3_avx = _mm256_add_epi16(r1_avx, r3_avx); // r3 += r1
+        r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); // r3 = -r3
+        temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30)); // 30*r1
+        temp_avx = _mm256_sub_epi16(temp_avx, r5_avx); // 30*r1 - r5
+        temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 0xEEEF, the inverse of 15 modulo 2^16 (15 * 0xEEEF = 0xE0001)
+
+        r5_avx = _mm256_srli_epi16(temp_avx, 2); // r5 = temp >> 2; with the multiply above this divides by 60
+        r2_avx = _mm256_sub_epi16(r2_avx, r4_avx); // r2 -= r4
+        r1_avx = _mm256_sub_epi16(r1_avx, r5_avx); // r1 -= r5
+        // Scatter the seven limbs into res_full: r6 goes to slot 0 (lowest indices) up to r0 in slot 6, each slot L vectors apart.
+        if (i < L) { // first L vectors of every limb: plain stores into indices 0 .. 7 * L - 1
+            res_full[0 * L + i] = r6_avx;
+            res_full[1 * L + i] = r5_avx;
+            res_full[2 * L + i] = r4_avx;
+            res_full[3 * L + i] = r3_avx;
+            res_full[4 * L + i] = r2_avx;
+            res_full[5 * L + i] = r1_avx;
+            res_full[6 * L + i] = r0_avx;
+        } else { // last L vectors: index k * L + i falls in the range the next slot already stored to, so accumulate
+            res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx);
+            res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx);
+            res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx);
+            res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx);
+            res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx);
+            res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], r1_avx);
+            res_full[6 * L + i] = r0_avx; // indices 7 * L .. 8 * L - 1 have not been written before, so store rather than add
+        }
+    }
+
+    // Reduction by X^256 + 1: vector i + 16 sits 256 coefficients (16 vectors of 16 lanes) above vector i, and X^256 = -1, so it is subtracted.
+    for (i = 0; i < 16; i++) {
+        vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]);
+    }
+}
diff --git a/crypto_kem/firesaber/avx2/polymul/consts.h b/crypto_kem/firesaber/avx2/polymul/consts.h
deleted file mode 100644
index 40826398..00000000
--- a/crypto_kem/firesaber/avx2/polymul/consts.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "../SABER_params.h"
-
-#define AVX_N (SABER_N >> 4)
-#define small_len_avx (AVX_N >> 2)
-
-#define SCHB_N 16
-
-#define N_SB (SABER_N >> 2)
-#define N_SB_RES (2*N_SB-1)
-
-#define N_SB_16 (N_SB >> 2)
-#define N_SB_16_RES (2*N_SB_16-1)
-
-#define AVX_N1 16 /*N/16*/ 
-
-#define SCM_SIZE 16
-
-// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements
-#define NUM_POLY SABER_K
-//int NUM_POLY=2; 
diff --git a/crypto_kem/firesaber/avx2/polymul/matrix.c b/crypto_kem/firesaber/avx2/polymul/matrix.c
deleted file mode 100644
index 5fa35783..00000000
--- a/crypto_kem/firesaber/avx2/polymul/matrix.c
+++ /dev/null
@@ -1,303 +0,0 @@
-#include <immintrin.h>
-
-static void transpose_n1(__m256i *M)
-{
-	//int i;
-	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
-	register __m256i temp, temp0, temp1, temp2;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
-		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
-		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
-		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
-		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
-		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
-		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
-		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
-
-
-		temp = _mm256_unpacklo_epi32(r0, r1); 
-		temp0 = _mm256_unpacklo_epi32(r2, r3); 
-		temp1 = _mm256_unpacklo_epi32(r4, r5); 
-		temp2 = _mm256_unpacklo_epi32(r6, r7); 
-
-		r8 = _mm256_unpackhi_epi32(r0, r1); 
-		r9 = _mm256_unpackhi_epi32(r2, r3); 
-		r10 = _mm256_unpackhi_epi32(r4, r5); 
-		r11 = _mm256_unpackhi_epi32(r6, r7);
-
-		r0 = _mm256_unpacklo_epi64(temp, temp0); 
-		r2 = _mm256_unpackhi_epi64(temp, temp0); 
-
-		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
-		r3 = _mm256_unpackhi_epi64(temp1, temp2);
-
-		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
-		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
-		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
-		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
-		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
-		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
-
-
-		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
-		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
-		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-
-
-		r0 = _mm256_unpacklo_epi64(r8, r9); 
-		r1 = _mm256_unpacklo_epi64(r10, r11); 
-
-		r2 = _mm256_unpackhi_epi64(r8, r9); 
-		r3 = _mm256_unpackhi_epi64(r10, r11); 
-
-
-
-		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
-		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
-
-
-	//for(i=0; i<4; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi32(temp, temp0); 
-		r1 = _mm256_unpacklo_epi32(temp1, temp2);
-		r2 = _mm256_unpacklo_epi32(r4, r5); 
-		r3 = _mm256_unpacklo_epi32(r6, r7); 
-
-	//}
-
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		r8 = _mm256_unpacklo_epi64(r0, r1); 
-		r10 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r9 = _mm256_unpacklo_epi64(r2, r3); 
-		r11 = _mm256_unpackhi_epi64(r2, r3); 
-
-		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
-		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
-		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
-		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
-
-		r0 = _mm256_unpackhi_epi32(temp, temp0); 
-		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
-		r2 = _mm256_unpackhi_epi32(r4, r5); 
-		r3 = _mm256_unpackhi_epi32(r6, r7); 
-
-	//}
-//	for(i=0; i<2; i=i+1)
-//	{
-		r4 = _mm256_unpacklo_epi64(r0, r1); 
-		r6 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r5 = _mm256_unpacklo_epi64(r2, r3); 
-		r7 = _mm256_unpackhi_epi64(r2, r3); 
-
-//	}
-
-	//-------------------------------------------------------
-
-	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
-	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
-	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
-	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
-}
-
-/*
-void transpose_unrolled(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
-		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
-
-		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
-		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
-
-		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
-		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
-
-		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
-		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
-
-		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
-		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
-		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
-
-		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
-		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
-
-		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
-		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-	//}
-
-	//-------------------------------------------------------
-	//for(i=0; i<4; i=i+1)
-	//{
-		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
-		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
-
-		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
-		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
-
-		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
-		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
-
-		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
-		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
-
-	//}
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
-		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
-
-		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
-		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
-
-		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	//}
-	//for(i=0; i<2; i=i+1)
-	//{
-		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
-		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
-
-		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
-		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
-
-	//}
-
-	//-------------------------------------------------------
-
-	//-------------------------------------------------------
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	//-------------------------------------------------------
-
-
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-
-
-void transpose1(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	for(i=0; i<8; i=i+1)
-	{
-		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
-		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
-	}
-
-	for(i=0; i<4; i=i+1)
-	{
-		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
-		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
-	}
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
-		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
-		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-*/
diff --git a/crypto_kem/firesaber/avx2/polymul/scm_avx.c b/crypto_kem/firesaber/avx2/polymul/scm_avx.c
deleted file mode 100644
index 48870f51..00000000
--- a/crypto_kem/firesaber/avx2/polymul/scm_avx.c
+++ /dev/null
@@ -1,753 +0,0 @@
-//#define SCM_SIZE 16
-
-//#pragma STDC FP_CONTRACT ON
-
-#include <immintrin.h>
-
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-
-static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are added cummulatively
-{
-
-	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	register __m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-
-	//otherwise accumulate
-	c_avx[0] = mul_add(a0, b0, c_avx[0]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	temp=mul_add(a1, b0, temp);
-	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-	temp = mul_add(a1, b1, temp);
-	temp=mul_add(a2, b0, temp);
-	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	temp=mul_add(a3, b0, temp);
-	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	temp=mul_add(a2, b2, temp);
-	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	temp=mul_add(a5, b0, temp);
-	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	temp=mul_add(a4, b2, temp);
-	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	temp=mul_add(a5, b2, temp);
-	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	
-		temp=mul_add(a6, b2, temp);
-		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-		temp=mul_add(a7, b2, temp);
-		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
-
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-		temp=mul_add(a[8], b2, temp);
-		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-		temp=mul_add(a[9], b2, temp);
-		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-		temp=mul_add(a[10], b2, temp);
-		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-		temp=mul_add(a[11], b2, temp);
-		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
-
-
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-		temp=mul_add(a[12], b2, temp);
-		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-		temp=mul_add(a[13], b2, temp);
-		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-		temp=mul_add(a1, b[1], temp);
-		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
-
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-		temp=mul_add(a1, b[2], temp);
-		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
-
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-		temp=mul_add(a1, b[3], temp);
-		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
-
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-		temp=mul_add(a1, b[4], temp);
-		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
-
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-		temp=mul_add(a1, b[5], temp);
-		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
-
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-		temp=mul_add(a1, b[6], temp);
-		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
-
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-		temp=mul_add(a1, b[7], temp);
-		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
-
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-		temp=mul_add(a1, b7, temp);
-		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
-
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-		temp=mul_add(a1, b6, temp);
-		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
-
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-		temp=mul_add(a1, b5, temp);
-		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
-
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-		temp=mul_add(a1, b4, temp);
-		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
-
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-		temp=mul_add(a1, b3, temp);
-		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
-
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-		temp=mul_add(a1, b2, temp);
-		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-		temp=mul_add(a1, b0, temp);
-		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
-
-
-		c_avx[30] = mul_add(a1, b1, c_avx[30]);
-
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-
-}
-
-
-
-static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are not added cummulatively
-{
-
-	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	__m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[1]=mul_add(a1, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-
-	temp = mul_add(a1, b1, temp);
-	c_avx[2]= mul_add(a2, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	c_avx[3]= mul_add(a3, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	c_avx[4]= mul_add(a2, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	c_avx[5] = mul_add(a5, b0, temp);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	c_avx[6] = mul_add(a4, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	c_avx[7] = mul_add (a5, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	c_avx[8] = mul_add (a6, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-	c_avx[9] = mul_add (a7, b2, temp);
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-	c_avx[10] = mul_add (a[8], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-	c_avx[11] = mul_add (a[9], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-	c_avx[12] = mul_add (a[10], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-	c_avx[13] = mul_add (a[11], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-	c_avx[14] = mul_add (a[12], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-	c_avx[15] = mul_add (a[13], b2, temp );
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-	
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-	c_avx[16] = mul_add (a1, b[1], temp );
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-	c_avx[17] = mul_add (a1, b[2], temp );
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-	c_avx[18] = mul_add (a1, b[3], temp );
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-	c_avx[19] = mul_add (a1, b[4], temp );
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-	c_avx[20] = mul_add (a1, b[5], temp );
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-	c_avx[21] = mul_add (a1, b[6], temp );
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-	c_avx[22] = mul_add (a1, b[7], temp );
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-	c_avx[23] = mul_add (a1, b7, temp );
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-	c_avx[24] = mul_add (a1, b6, temp );
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-	c_avx[25] = mul_add (a1, b5, temp );
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-	c_avx[26] = mul_add (a1, b4, temp );
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-	c_avx[27] = mul_add (a1, b3, temp );
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-	c_avx[28] = mul_add (a1, b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[29] = mul_add (a1, b0, temp);
-
-	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-}
diff --git a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
deleted file mode 100644
index 78fb86c2..00000000
--- a/crypto_kem/firesaber/avx2/polymul/toom-cook_4way.c
+++ /dev/null
@@ -1,1010 +0,0 @@
-/*
-Cleaned version for step by step approach look into the _debug file
-*/
-//#include "timing.c"
-#include "consts.h"
-#include "matrix.c"
-#include "scm_avx.c"
-
-static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX.
-{
-	__m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time
-
-	//uint16_t i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-
-		//CLOCK1=cpucycles();
-		
-		//------------------AVX evaluation for 1st poly-----------------------
-
-                    r0_avx=a[0];
-                    r1_avx=a[1];
-                    r2_avx=a[2];
-                    r3_avx=a[3];
-		    a_bucket[0]=r0_avx;
-		    a_bucket[1]=r1_avx;
-		    a_bucket[2]=r2_avx;
-		    a_bucket[3]=r3_avx;
-		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
-
-
-		//------------------AVX evaluation for 1st poly ends------------------
-
-
-		//------------------AVX evaluation for 2nd poly-----------------------
-                    r0_avx=a[small_len_avx];
-                    r1_avx=a[small_len_avx+1];
-                    r2_avx=a[small_len_avx+2];
-                    r3_avx=a[small_len_avx+3];
-		    a_bucket[0+9]=r0_avx;
-		    a_bucket[1+9]=r1_avx;
-		    a_bucket[2+9]=r2_avx;
-		    a_bucket[3+9]=r3_avx;
-		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
-
-	
-		//------------------AVX evaluation for 2nd poly ends------------------
-
-
-		//------------------AVX evaluation for 3rd poly-----------------------
-                    r0_avx=a[2*small_len_avx];
-                    r1_avx=a[2*small_len_avx+1];
-                    r2_avx=a[2*small_len_avx+2];
-                    r3_avx=a[2*small_len_avx+3];
-		    a_bucket[0+18]=r0_avx;
-		    a_bucket[1+18]=r1_avx;
-		    a_bucket[2+18]=r2_avx;
-		    a_bucket[3+18]=r3_avx;
-		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
-		
-		//------------------AVX evaluation for 3rd poly ends------------------
-
-
-		//------------------AVX evaluation for 4th poly-----------------------
-
-                    r0_avx=a[3*small_len_avx];
-                    r1_avx=a[3*small_len_avx+1];
-                    r2_avx=a[3*small_len_avx+2];
-                    r3_avx=a[3*small_len_avx+3];
-		    a_bucket[0+27]=r0_avx;
-		    a_bucket[1+27]=r1_avx;
-		    a_bucket[2+27]=r2_avx;
-		    a_bucket[3+27]=r3_avx;
-		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
-		
-		//------------------AVX evaluation for 4th poly ends------------------
-
-		//------------------AVX evaluation for 5th poly-----------------------
-		
-                    r0_avx=a[4*small_len_avx+0];
-                    r1_avx=a[4*small_len_avx+1];
-                    r2_avx=a[4*small_len_avx+2];
-                    r3_avx=a[4*small_len_avx+3];
-		    a_bucket[0+36]=r0_avx;
-		    a_bucket[1+36]=r1_avx;
-		    a_bucket[2+36]=r2_avx;
-		    a_bucket[3+36]=r3_avx;
-		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
-		
-		//------------------AVX evaluation for 5th poly ends------------------
-
-
-		//------------------AVX evaluation for 6th poly-----------------------
-                    r0_avx=a[5*small_len_avx];
-                    r1_avx=a[5*small_len_avx+1];
-                    r2_avx=a[5*small_len_avx+2];
-                    r3_avx=a[5*small_len_avx+3];
-		    a_bucket[0+45]=r0_avx;
-		    a_bucket[1+45]=r1_avx;
-		    a_bucket[2+45]=r2_avx;
-		    a_bucket[3+45]=r3_avx;
-		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
-		
-		//------------------AVX evaluation for 6th poly ends------------------
-
-		//------------------AVX evaluation for 7th poly-----------------------
-
-                    r0_avx=a[6*small_len_avx];
-                    r1_avx=a[6*small_len_avx+1];
-                    r2_avx=a[6*small_len_avx+2];
-                    r3_avx=a[6*small_len_avx+3];
-		    a_bucket[0+54]=r0_avx;
-		    a_bucket[1+54]=r1_avx;
-		    a_bucket[2+54]=r2_avx;
-		    a_bucket[3+54]=r3_avx;
-		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
-
-		//------------------AVX evaluation for 7th poly ends------------------
-		
-	
-
-		//CLOCK2=cpucycles();
-		//CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1);
-		//printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1);
-
-
-		//CLOCK1=cpucycles();
-		//-----------------Forward transposes--------------------------------------
-			transpose_n1(a_bucket);
-			transpose_n1(a_bucket+16);
-			transpose_n1(a_bucket+32);
-			transpose_n1(a_bucket+48);
-
-		//-----------------Forwatrd transposes ends---------------------------------
-
-		//----------------------all multiplications---------------------------------
-		if(f==0){
-			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		else{
-			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		/*
-		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
-		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
-		*/
-
-
-		//----------------------all multiplications ends-----------------------------
-
-
-		//-----------------Reverse transposes--------------------------------------
-
-			/*
-			transpose(c_bucket);
-			transpose(c_bucket+16);
-
-			transpose(c_bucket+2*SCM_SIZE);
-			transpose(c_bucket+16+2*SCM_SIZE);
-
-			transpose(c_bucket+4*SCM_SIZE);
-			transpose(c_bucket+16+4*SCM_SIZE);
-
-			transpose(c_bucket+6*SCM_SIZE);
-			transpose(c_bucket+16+6*SCM_SIZE);
-			*/
-		//-----------------Reverse transposes ends---------------------------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
-
-		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
-		
-}
-
-static void KARA_eval(__m256i* b, __m256i *b_bucket){
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-		//-------1st poly----------------------------------------------------
-                    r0_avx=b[0];
-                    r1_avx=b[1];
-                    r2_avx=b[2];
-                    r3_avx=b[3];
-		    b_bucket[0]=r0_avx;
-		    b_bucket[1]=r1_avx;
-		    b_bucket[2]=r2_avx;
-		    b_bucket[3]=r3_avx;
-		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
-		//-------2nd poly----------------------------------------------------
-
-                    r0_avx=b[small_len_avx];
-                    r1_avx=b[small_len_avx+1];
-                    r2_avx=b[small_len_avx+2];
-                    r3_avx=b[small_len_avx+3];
-		    b_bucket[0+9]=r0_avx;
-		    b_bucket[1+9]=r1_avx;
-		    b_bucket[2+9]=r2_avx;
-		    b_bucket[3+9]=r3_avx;
-		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
-
-		//-------3rd poly----------------------------------------------------
-
-                    r0_avx=b[2*small_len_avx+0];
-                    r1_avx=b[2*small_len_avx+1];
-                    r2_avx=b[2*small_len_avx+2];
-                    r3_avx=b[2*small_len_avx+3];
-		    b_bucket[0+18]=r0_avx;
-		    b_bucket[1+18]=r1_avx;
-		    b_bucket[2+18]=r2_avx;
-		    b_bucket[3+18]=r3_avx;
-		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
-
-		//-------4th poly----------------------------------------------------
-                    r0_avx=b[3*small_len_avx];
-                    r1_avx=b[3*small_len_avx+1];
-                    r2_avx=b[3*small_len_avx+2];
-                    r3_avx=b[3*small_len_avx+3];
-		    b_bucket[0+27]=r0_avx;
-		    b_bucket[1+27]=r1_avx;
-		    b_bucket[2+27]=r2_avx;
-		    b_bucket[3+27]=r3_avx;
-		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
-
-		//-------5th poly----------------------------------------------------
-
-                    r0_avx=b[4*small_len_avx];
-                    r1_avx=b[4*small_len_avx+1];
-                    r2_avx=b[4*small_len_avx+2];
-                    r3_avx=b[4*small_len_avx+3];
-		    b_bucket[0+36]=r0_avx;
-		    b_bucket[1+36]=r1_avx;
-		    b_bucket[2+36]=r2_avx;
-		    b_bucket[3+36]=r3_avx;
-		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
-
-		//-------6th poly----------------------------------------------------
-
-                    r0_avx=b[5*small_len_avx];
-                    r1_avx=b[5*small_len_avx+1];
-                    r2_avx=b[5*small_len_avx+2];
-                    r3_avx=b[5*small_len_avx+3];
-		    b_bucket[0+45]=r0_avx;
-		    b_bucket[1+45]=r1_avx;
-		    b_bucket[2+45]=r2_avx;
-		    b_bucket[3+45]=r3_avx;
-		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
-
-		//-------7th poly----------------------------------------------------
-
-                    r0_avx=b[6*small_len_avx];
-                    r1_avx=b[6*small_len_avx+1];
-                    r2_avx=b[6*small_len_avx+2];
-                    r3_avx=b[6*small_len_avx+3];
-		    b_bucket[0+54]=r0_avx;
-		    b_bucket[1+54]=r1_avx;
-		    b_bucket[2+54]=r2_avx;
-		    b_bucket[3+54]=r3_avx;
-		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
-
-		//--------------Evaluating B poly ends-------------------------------
-
-			transpose_n1(b_bucket);
-			transpose_n1(b_bucket+16);
-			transpose_n1(b_bucket+32);
-			transpose_n1(b_bucket+48);	
-}
-
-static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
-
-		//int64_t i;
-		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
-
-		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
-
-		//CLOCK1=cpucycles();
-
-		   //------------------------AVX interpolation for 1st poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[0];
-				res_avx2 = c_bucket[1];
-				res_avx4 = c_bucket[2];
-				res_avx6 = c_bucket[3];
-
-				c6_avx=c_bucket[6];
-				c7_avx=c_bucket[7];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[16];
-				res_avx3 = c_bucket[17];
-				res_avx5 = c_bucket[18];
-				res_avx7 = c_bucket[19];
-
-				c22_avx=c_bucket[22];
-				c23_avx=c_bucket[23];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final0[0]=res_avx0;
-				result_final0[1]=res_avx1;
-
-				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final0[6]=res_avx6;
-				result_final0[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 1st poly ends--------------
-
-
-		   //------------------------AVX interpolation for 2nd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[9]; //c_bucket0
-				res_avx2 = c_bucket[10]; //c_bucket1
-				res_avx4 = c_bucket[11]; //c_bucket2
-				res_avx6 = c_bucket[12]; //c_bucket3
-
-				c6_avx=c_bucket[15]; //c_bucket6
-				c7_avx=c_bucket[32]; //c_bucket7
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[25]; //c_bucket0
-				res_avx3 = c_bucket[26]; //c_bucket1
-				res_avx5 = c_bucket[27]; //c_bucket2
-				res_avx7 = c_bucket[28]; //c_bucket3
-
-				c22_avx=c_bucket[31];
-				c23_avx=c_bucket[48];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final1[0]=res_avx0;
-				result_final1[1]=res_avx1;
-
-				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final1[6]=res_avx6;
-				result_final1[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 2nd poly ends--------------
-
-		   //------------------------AVX interpolation for 3rd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[34]; //c_bucket0
-				res_avx2 = c_bucket[35]; //c_bucket1
-				res_avx4 = c_bucket[36];
-				res_avx6 = c_bucket[37];
-
-				c6_avx=c_bucket[40];
-				c7_avx=c_bucket[41];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[50]; //c_bucket0
-				res_avx3 = c_bucket[51]; //c_bucket1
-				res_avx5 = c_bucket[52];
-				res_avx7 = c_bucket[53];
-
-				c22_avx=c_bucket[56];
-				c23_avx=c_bucket[57];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-			//loop4
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-			//loop5
-				result_final2[0]=res_avx0;
-				result_final2[1]=res_avx1;
-
-				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final2[6]=res_avx6;
-				result_final2[7]=res_avx7;
-
-		   //------------------------AVX interpolation for 3rd poly ends--------------
-		
-		   //------------------------AVX interpolation for 4th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[43];
-				res_avx2 = c_bucket[44];
-				res_avx4 = c_bucket[45];
-				res_avx6 = c_bucket[46];
-
-				c6_avx=c_bucket[65];
-				c7_avx=c_bucket[66];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[59];
-				res_avx3 = c_bucket[60];
-				res_avx5 = c_bucket[61];
-				res_avx7 = c_bucket[62];
-
-				c22_avx=c_bucket[81];
-				c23_avx=c_bucket[82];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final3[0]=res_avx0;
-				result_final3[1]=res_avx1;
-
-				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final3[6]=res_avx6;
-				result_final3[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 4th poly ends--------------
-
-		   //------------------------AVX interpolation for 5th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[68];
-				res_avx2 = c_bucket[69];
-				res_avx4 = c_bucket[70];
-				res_avx6 = c_bucket[71];
-
-				c6_avx=c_bucket[74];
-				c7_avx=c_bucket[75];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[84];
-				res_avx3 = c_bucket[85];
-				res_avx5 = c_bucket[86];
-				res_avx7 = c_bucket[87];
-
-				c22_avx=c_bucket[90];
-				c23_avx=c_bucket[91];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final4[0]=res_avx0;
-				result_final4[1]=res_avx1;
-
-				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final4[6]=res_avx6;
-				result_final4[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 5th poly ends--------------
-
-		   //------------------------AVX interpolation for 6th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[77];
-				res_avx2 = c_bucket[78];
-				res_avx4 = c_bucket[79];
-				res_avx6 = c_bucket[96];
-
-				c6_avx=c_bucket[99];
-				c7_avx=c_bucket[100];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[93];
-				res_avx3 = c_bucket[94];
-				res_avx5 = c_bucket[95];
-				res_avx7 = c_bucket[112];
-
-				c22_avx=c_bucket[115];
-				c23_avx=c_bucket[116];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final5[0]=res_avx0;
-				result_final5[1]=res_avx1;
-
-				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final5[6]=res_avx6;
-				result_final5[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 6th poly ends--------------
-
-		   //------------------------AVX interpolation for 7th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[102];
-				res_avx2 = c_bucket[103];
-				res_avx4 = c_bucket[104];
-				res_avx6 = c_bucket[105];
-
-				c6_avx=c_bucket[108];
-				c7_avx=c_bucket[109];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[118];
-				res_avx3 = c_bucket[119];
-				res_avx5 = c_bucket[120];
-				res_avx7 = c_bucket[121];
-
-				c22_avx=c_bucket[124];
-				c23_avx=c_bucket[125];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final6[0]=res_avx0;
-				result_final6[1]=res_avx1;
-
-				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final6[6]=res_avx6;
-				result_final6[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 7th poly ends--------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
-		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
-
-
-
-}
-
-static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
-
-	int i;
-
-//---------------AVX data-----------------------------
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-	__m256i aw_avx[7*small_len_avx];
-
-//----------------AVX data----------------------------
-
-
-// EVALUATION
-
-	//CLOCK1=cpucycles();
-
-	for (i=0; i<small_len_avx; i++){
-		r0_avx=a_avx[i];
-		r1_avx=a_avx[i + small_len_avx];
-		r2_avx=a_avx[i + 2*small_len_avx];
-		r3_avx=a_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		aw_avx[6*small_len_avx+i]= r0_avx; 
-		aw_avx[i]= r3_avx;
-	}
-
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
-
-	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);//New
-
-}
-
-static void TC_eval(__m256i* b_avx, __m256i* b_bucket){
-
-	int i;
-	__m256i bw_avx[7*small_len_avx];
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-
-	for (i=0; i<small_len_avx; i++){
-		
-		r0_avx=b_avx[i];
-		r1_avx=b_avx[i + small_len_avx];
-		r2_avx=b_avx[i + 2*small_len_avx];
-		r3_avx=b_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		bw_avx[6*small_len_avx+i]= r0_avx;
-		bw_avx[i]= r3_avx;
-	}
-
-	KARA_eval(bw_avx, b_bucket);
-
-}
-
-
-static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){
-
-	int i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
-
-	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx];
-
-	__m256i res_avx_output[2*AVX_N1];
-
-	//CLOCK1=cpucycles();
-
-	
-	transpose_n1(c_bucket);
-	transpose_n1(c_bucket+16);
-
-	transpose_n1(c_bucket+2*SCM_SIZE);
-	transpose_n1(c_bucket+16+2*SCM_SIZE);
-
-	transpose_n1(c_bucket+4*SCM_SIZE);
-	transpose_n1(c_bucket+16+4*SCM_SIZE);
-
-	transpose_n1(c_bucket+6*SCM_SIZE);
-	transpose_n1(c_bucket+16+6*SCM_SIZE);
-	
-
-	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx);
-
-	for (i = 0; i < 2*small_len_avx; i++) {
-
-		r0_avx = w1_avx[i];
-		r1_avx = w2_avx[i];
-		r2_avx = w3_avx[i];
-		r3_avx = w4_avx[i];
-		r4_avx = w5_avx[i];
-		r5_avx = w6_avx[i];
-		r6_avx = w7_avx[i];
-		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
-		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
-		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
-		r3_avx = _mm256_srli_epi16(r3_avx, 1);
-		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
-		temp_avx = _mm256_slli_epi16(r6_avx, 6);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_slli_epi16(r4_avx, 1);
-		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
-		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 6);
-		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
-		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
-		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 3);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
-		r4_avx = _mm256_srli_epi16(r4_avx, 3);
-		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
-		temp_avx = _mm256_slli_epi16(r3_avx, 4);
-		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
-		r1_avx= _mm256_srli_epi16(r1_avx, 1); 	
-		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
-		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
-		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
-		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
-		r5_avx= _mm256_srli_epi16(temp_avx, 2);
-		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
-
-		if(i<small_len_avx){
-			res_avx_output[0*small_len_avx+i]=r6_avx;
-			res_avx_output[1*small_len_avx+i]=r5_avx;
-			res_avx_output[2*small_len_avx+i]=r4_avx;
-			res_avx_output[3*small_len_avx+i]=r3_avx;
-			res_avx_output[4*small_len_avx+i]=r2_avx;
-			res_avx_output[5*small_len_avx+i]=r1_avx;
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-		else{
-			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
-			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
-			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
-			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
-			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
-			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-	}
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
-
-	// Reduction by X^256 + 1
-	for(i=0; i<16; i++)
-  {
-		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
-  }
-
-}
diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.c b/crypto_kem/firesaber/clean/SABER_indcpa.c
index 342eb7ca..a9e7c141 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.c
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.c
@@ -11,81 +11,102 @@
 #define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
 void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {{0}};
-
-    uint8_t seed_A[SABER_SEEDBYTES];
-    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
     size_t i, j;
 
+    poly A[SABER_L][SABER_L]; // SABER_L x SABER_L matrix of polynomials, filled by GenMatrix below
+    poly s[SABER_L]; // secret vector, filled by GenSecret below
+    poly res[SABER_L]; // receives the MatrixVectorMul output, then rounded in place
+
+    uint8_t rand[SABER_NOISESEEDBYTES]; // seed bytes handed to GenSecret
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // seed_A lives inside pk at this offset, so no separate copy into pk is needed
+
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(seed_s, SABER_NOISE_SEEDBYTES);
 
-    PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_FIRESABER_CLEAN_GenSecret(s, seed_s);
-    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1);
+    randombytes(rand, SABER_NOISESEEDBYTES); // fresh randomness for the secret
+    PQCLEAN_FIRESABER_CLEAN_GenSecret(s, rand); // derive the secret vector from the seed bytes
+    PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, s); // serialize the secret vector into sk
 
+    PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A
+    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // Matrix in transposed order
+
+
+    // rounding
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1; // add the rounding constant h1
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP; // drop the low SABER_EQ - SABER_EP bits
+            res[i].coeffs[j] &= SABER_Q - 1; // keep only the low SABER_EQ bits
         }
     }
 
-    PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b);
-    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A));
+    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(pk, res); // pack public key
 }
 
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {{0}};
-    uint16_t vp[SABER_N] = {0};
-    uint16_t mp[SABER_N];
-    uint16_t b[SABER_L][SABER_N];
+
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) { // encrypts m under pk; noiseseed determines the secret s
     size_t i, j;
+
+    poly A[SABER_L][SABER_L]; // matrix generated from seed_A; row 0 is reused as scratch after MatrixVectorMul
+    poly res[SABER_L]; // matrix-vector product, rounded in place and packed at the start of ciphertext
+    poly s[SABER_L]; // secret vector derived from noiseseed
+    poly *temp = A[0]; // re-use stack space
+    poly *vprime = &A[0][0]; // same storage as temp[0]
+    poly *message = &A[0][1]; // same storage as temp[1]
+
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // destination of the packed vprime, after the packed vector
 
+    PQCLEAN_FIRESABER_CLEAN_GenSecret(s, noiseseed);
     PQCLEAN_FIRESABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_FIRESABER_CLEAN_GenSecret(sp, seed_sp);
-    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0);
+    PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed
 
-    for (i = 0; i < SABER_L; i++) {
+
+    // rounding
+    for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits
         for (j = 0; j < SABER_N; j++) {
-            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1; // add the rounding constant h1
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP; // drop the low SABER_EQ - SABER_EP bits
+            res[i].coeffs[j] &= SABER_Q - 1; // keep only the low SABER_EQ bits
         }
     }
+    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, res); // first ciphertext component
 
-    PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp);
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, pk);
-    PQCLEAN_FIRESABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp);
-
-    PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(mp, m);
-
-    for (j = 0; j < SABER_N; j++) {
-        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET);
-    }
-
-    PQCLEAN_FIRESABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
-}
-
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
-
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N];
-    uint16_t v[SABER_N] = {0};
-    uint16_t cm[SABER_N];
-    size_t i;
-
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk);
-    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(b, ciphertext);
-    PQCLEAN_FIRESABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES);
+    // vector-vector scalar multiplication with mod p
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, pk); // unpack the vector at the start of pk into temp (overwrites row A[0])
+    PQCLEAN_FIRESABER_CLEAN_InnerProd(vprime, temp, s); // NOTE(review): vprime aliases temp[0]; assumes InnerProd tolerates output aliasing its input - confirm in poly.c
+    PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(message, m); // overwrites temp[1], which is not read again after InnerProd
 
     for (i = 0; i < SABER_N; i++) {
-        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); // add h1 and subtract the message bit moved to bit SABER_EP - 1
+        vprime->coeffs[i] &= SABER_P - 1; // keep only the low SABER_EP bits
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET; // keep the top SABER_ET of those bits
+    }
+
+    PQCLEAN_FIRESABER_CLEAN_POLT2BS(msk_c, vprime); // second ciphertext component
+}
+
+
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
+
+    poly temp[SABER_L];
+    poly s[SABER_L];
+
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES;
+    poly *v = &temp[0];
+    poly *cm = &temp[1];
+
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(s, sk);
+    PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(temp, ciphertext);
+    PQCLEAN_FIRESABER_CLEAN_InnerProd(&temp[0], temp, s);
+
+    PQCLEAN_FIRESABER_CLEAN_BS2POLT(cm, packed_cm);
+
+    for (i = 0; i < SABER_N; i++) {
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET));
+        v->coeffs[i] &= SABER_P - 1;
+        v->coeffs[i] >>= SABER_EP - 1;
     }
 
     PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(m, v);
diff --git a/crypto_kem/firesaber/clean/SABER_indcpa.h b/crypto_kem/firesaber/clean/SABER_indcpa.h
index 28a5feee..cc009afe 100644
--- a/crypto_kem/firesaber/clean/SABER_indcpa.h
+++ b/crypto_kem/firesaber/clean/SABER_indcpa.h
@@ -5,7 +5,7 @@
 
 void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
 
-void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
 
 void PQCLEAN_FIRESABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
diff --git a/crypto_kem/firesaber/clean/SABER_params.h b/crypto_kem/firesaber/clean/SABER_params.h
index 9121a12b..6481efec 100644
--- a/crypto_kem/firesaber/clean/SABER_params.h
+++ b/crypto_kem/firesaber/clean/SABER_params.h
@@ -2,19 +2,21 @@
 #define PARAMS_H
 
 
-/* Change this for different security strengths */
-
 /* Don't change anything below this line */
 #define SABER_L 4
 #define SABER_MU 6
 #define SABER_ET 6
 
-#define SABER_EQ 13
-#define SABER_EP 10
 #define SABER_N 256
 
+#define SABER_EP 10
+#define SABER_P (1 << SABER_EP) // 2^SABER_EP; SABER_P - 1 masks the low SABER_EP bits
+
+#define SABER_EQ 13
+#define SABER_Q (1 << SABER_EQ) // 2^SABER_EQ; SABER_Q - 1 masks the low SABER_EQ bits
+
 #define SABER_SEEDBYTES 32
-#define SABER_NOISE_SEEDBYTES 32
+#define SABER_NOISESEEDBYTES 32
 #define SABER_KEYBYTES 32
 #define SABER_HASHBYTES 32
 
diff --git a/crypto_kem/firesaber/clean/api.h b/crypto_kem/firesaber/clean/api.h
index 14718674..fdff18fa 100644
--- a/crypto_kem/firesaber/clean/api.h
+++ b/crypto_kem/firesaber/clean/api.h
@@ -15,4 +15,4 @@ int PQCLEAN_FIRESABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k,
 int PQCLEAN_FIRESABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
 
 
-#endif /* api_h */
+#endif /* PQCLEAN_FIRESABER_CLEAN_API_H */
diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c
index 0add1409..ec2f1263 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.c
+++ b/crypto_kem/firesaber/clean/pack_unpack.c
@@ -1,136 +1,149 @@
-#include "api.h"
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include <string.h>
 
-void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { // packs the low 6 bits of every coefficient: 4 coefficients -> 3 bytes
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor over the coefficients
+    uint8_t *out = bytes; // write cursor over the byte string
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2);
+        out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6);
+        out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2);
+        in += 4; // next group of 4 coefficients
+        out += 3; // next 3 output bytes
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { // unpacks 3 bytes -> 4 coefficients of 6 bits each
+    size_t j;
+    const uint8_t *in = bytes; // read cursor over the byte string
+    uint16_t *out = data->coeffs; // write cursor over the coefficients
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) | ((bytes[offset_byte + 1] & 0x0f) << 2);
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
+        out[0] = in[0] & 0x3f;
+        out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2);
+        out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4);
+        out[3] = ((in[2] & 0xff) >> 2);
+        in += 3; // next 3 input bytes
+        out += 4; // next group of 4 coefficients
     }
 }
 
-static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { // packs the low 13 bits of every coefficient: 8 coefficients -> 13 bytes
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor over the coefficients
+    uint8_t *out = bytes; // write cursor over the byte string
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
-        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
-        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
-        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
-        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
-        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
-        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
-        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
-        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
-        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
-        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff);
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+        out[2] = ((in[1] >> 3) & 0xff);
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+        out[5] = ((in[3] >> 1) & 0xff);
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+        out[7] = ((in[4] >> 4) & 0xff);
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+        out[10] = ((in[6] >> 2) & 0xff);
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+        out[12] = ((in[7] >> 5) & 0xff);
+        in += 8; // next group of 8 coefficients
+        out += 13; // next 13 output bytes
     }
 }
 
-static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { // unpacks 13 bytes -> 8 coefficients of 13 bits each
+    size_t j;
+    const uint8_t *in = bytes; // read cursor over the byte string
+    uint16_t *out = data->coeffs; // write cursor over the coefficients
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        in += 13; // next 13 input bytes
+        out += 8; // next group of 8 coefficients
     }
 }
 
-static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { // packs the low 10 bits of every coefficient: 4 coefficients -> 5 bytes
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor over the coefficients
+    uint8_t *out = bytes; // write cursor over the byte string
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6);
-        bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff);
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
+        out[4] = ((in[3] >> 2) & 0xff);
+        in += 4; // next group of 4 coefficients
+        out += 5; // next 5 output bytes
     }
 }
 
-static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { // unpacks 5 bytes -> 4 coefficients of 10 bits each
+    size_t j;
+    const uint8_t *in = bytes; // read cursor over the byte string
+    uint16_t *out = data->coeffs; // write cursor over the coefficients
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8);
-        data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6);
-        data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        in += 5; // next 5 input bytes
+        out += 4; // next group of 4 coefficients
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { // packs the SABER_L polynomials back to back with POLq2BS
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLq2BS(bytes + i * SABER_POLYBYTES, data[i]);
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); // polynomial i starts at byte offset i * SABER_POLYBYTES
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) {
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { // unpacks SABER_L consecutive polynomials with BS2POLq
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLq(data[i], bytes + i * SABER_POLYBYTES);
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); // polynomial i starts at byte offset i * SABER_POLYBYTES
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { // packs the SABER_L polynomials back to back with POLp2BS
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]);
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); // polynomial i starts at byte offset i * SABER_POLYCOMPRESSEDBYTES
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { // unpacks SABER_L consecutive polynomials with BS2POLp
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8));
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); // polynomial i starts at byte offset i * SABER_POLYCOMPRESSEDBYTES
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) {
+void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { // expands every message bit into one coefficient holding 0 or 1
     size_t i, j;
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            data[j * 8 + i] = ((bytes[j] >> i) & 0x01);
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j becomes coefficient 8 * j + i
         }
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) {
+void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { // collects the lowest bit of every coefficient, 8 coefficients per byte
     size_t i, j;
     memset(bytes, 0, SABER_KEYBYTES);
 
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i);
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); // lowest bit of coefficient 8 * j + i becomes bit i of byte j
         }
     }
 }
diff --git a/crypto_kem/firesaber/clean/pack_unpack.h b/crypto_kem/firesaber/clean/pack_unpack.h
index 0a8ee253..698cecb1 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.h
+++ b/crypto_kem/firesaber/clean/pack_unpack.h
@@ -1,27 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]);
+void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
+void PQCLEAN_FIRESABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/firesaber/clean/poly.c b/crypto_kem/firesaber/clean/poly.c
index c6d729ba..2ce0e871 100644
--- a/crypto_kem/firesaber/clean/poly.c
+++ b/crypto_kem/firesaber/clean/poly.c
@@ -3,32 +3,40 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include <stddef.h>
 
-void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
+void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { // c = A * s, or c = A^T * s when transpose is nonzero (the removed code tested transpose == 1); c is overwritten rather than accumulated into
     size_t i, j;
-    for (i = 0; i < SABER_L; i++) {
-        for (j = 0; j < SABER_L; j++) {
-            if (transpose == 1) {
-                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
-            } else {
-                PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
+
+    if (transpose) {
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); // accumulate = 0: the first product overwrites c[i]
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); // accumulate = 1: add the remaining products; A[j][i] walks column i of A
+            }
+        }
+    } else {
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); // accumulate = 0: the first product overwrites c[i]
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_FIRESABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); // accumulate = 1: add the remaining products; A[i][j] walks row i of A
             }
         }
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    size_t j;
-    for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
+void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { // c = sum over i of b[i] * s[i]; c is overwritten rather than accumulated into
+    size_t i;
+
+    PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); // accumulate = 0: the first product overwrites c
+    for (i = 1; i < SABER_L; i++) {
+        PQCLEAN_FIRESABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); // accumulate = 1: add the remaining products
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
+void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
@@ -37,13 +45,13 @@ void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], co
     }
 }
 
-void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYCOINBYTES];
+void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { // fills the SABER_L polynomials of s from seed: SHAKE-128 output is cut into SABER_POLYCOINBYTES chunks and each chunk is passed to cbd
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // SHAKE-128 output, one SABER_POLYCOINBYTES chunk per polynomial
 
-    shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES);
+    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
 
     for (i = 0; i < SABER_L; i++) {
-        PQCLEAN_FIRESABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES);
+        PQCLEAN_FIRESABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); // cbd is handed the raw coefficient array, hence .coeffs
     }
 }
diff --git a/crypto_kem/firesaber/clean/poly.h b/crypto_kem/firesaber/clean/poly.h
index 044e4eec..fdbbfa1f 100644
--- a/crypto_kem/firesaber/clean/poly.h
+++ b/crypto_kem/firesaber/clean/poly.h
@@ -3,13 +3,21 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose);
+typedef union { /* one polynomial, stored as SABER_N 16-bit coefficients */
+    uint16_t coeffs[SABER_N];
+} poly; /* NOTE(review): single-member union; presumably a union rather than a struct to mirror a multi-member poly type in another implementation -- confirm */
 
-void PQCLEAN_FIRESABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]);
 
-void PQCLEAN_FIRESABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose);
 
-void PQCLEAN_FIRESABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
+void PQCLEAN_FIRESABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]);
+
+void PQCLEAN_FIRESABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_FIRESABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
+
+
+void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/firesaber/clean/poly_mul.c b/crypto_kem/firesaber/clean/poly_mul.c
index 6b527c21..b57e04fb 100644
--- a/crypto_kem/firesaber/clean/poly_mul.c
+++ b/crypto_kem/firesaber/clean/poly_mul.c
@@ -1,4 +1,4 @@
-#include "poly_mul.h"
+#include "poly.h"
 #include <stdint.h>
 #include <string.h>
 
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t
 }
 
 /* res += a*b */
-void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
-    uint16_t c[2 * SABER_N] = {0};
+void PQCLEAN_FIRESABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { // accumulate == 0: c = a*b; otherwise c += a*b (the "res += a*b" note above now describes only the accumulate case)
+    uint16_t C[2 * SABER_N] = {0}; // scratch of 2 * SABER_N coefficients handed to toom_cook_4way as its result buffer
     size_t i;
 
-    toom_cook_4way(c, a, b);
+    toom_cook_4way(C, a->coeffs, b->coeffs);
 
     /* reduction */
-    for (i = SABER_N; i < 2 * SABER_N; i++) {
-        res[i - SABER_N] += (c[i - SABER_N] - c[i]);
+    if (accumulate == 0) {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); // upper half is folded back with a minus sign, i.e. reduction modulo x^SABER_N + 1; the result replaces c
+        }
+    } else {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); // same reduction, added onto the existing contents of c
+        }
     }
 }
diff --git a/crypto_kem/firesaber/clean/poly_mul.h b/crypto_kem/firesaber/clean/poly_mul.h
index b6911577..b28b04f6 100644
--- a/crypto_kem/firesaber/clean/poly_mul.h
+++ b/crypto_kem/firesaber/clean/poly_mul.h
@@ -1,9 +1,3 @@
-#ifndef POLY_MUL_H
-#define POLY_MUL_H
-#include "SABER_params.h"
-#include <stdint.h>
-
-void PQCLEAN_FIRESABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
-#endif
+
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index 7e1dd2eb..027f1fab 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile
index 0522fe8d..f2817574 100644
--- a/crypto_kem/lightsaber/avx2/Makefile
+++ b/crypto_kem/lightsaber/avx2/Makefile
@@ -2,7 +2,7 @@
 
 LIB=liblightsaber_avx2.a
 HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
-OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
diff --git a/crypto_kem/lightsaber/avx2/SABER_indcpa.c b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
index 47f760e9..50f57221 100644
--- a/crypto_kem/lightsaber/avx2/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/avx2/SABER_indcpa.c
@@ -1,416 +1,125 @@
-#include "./polymul/toom-cook_4way.c"
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
 #include "fips202.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include "randombytes.h"
 #include <stdint.h>
-#include <stdio.h>
 #include <string.h>
-//#include "randombytes.h"
-//#include "./polymul/toom_cook_4/toom-cook_4way.c"
 
-#define h1 4 //2^(EQ-EP-1)
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) { // IND-CPA key generation: sk receives the packed secret vector, pk receives the packed rounded matrix-vector product followed by seed_A
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly *skpv1 = A[0]; // use first row of A to hold sk temporarily
+    toom4_points skpv1_eval[SABER_L]; // toom4_eval output for each secret polynomial; this, not skpv1, is what MatrixVectorMul is given
+    poly res[SABER_L];
 
-static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
-    int32_t i, j;
+    uint8_t rand[SABER_NOISESEEDBYTES];
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // seed_A is generated in place in the last SABER_SEEDBYTES bytes of pk
 
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
+    randombytes(seed_A, SABER_SEEDBYTES);
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    randombytes(rand, SABER_NOISESEEDBYTES);
+    PQCLEAN_LIGHTSABER_AVX2_GenSecret(skpv1, rand);
+    PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key
 
-static void GenMatrix(polyvec *a, const uint8_t *seed) {
-    uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_LIGHTSABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
-        }
-    }
-}
-
-static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
-
-    uint32_t i;
-
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        PQCLEAN_LIGHTSABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
-    }
-}
-
-//********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
-    int64_t i, j;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        for (j = 0; j < NUM_POLY; j++) {
-
-            if (isTranspose == 0) {
-                toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j);
-            } else {
-                toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j);
-            }
-        }
-
-        TC_interpol(c_bucket, res_avx[i]);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]);
     }
 
-}
+    PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); // sample matrix A; this overwrites skpv1 (row A[0]), which has already been packed into sk and evaluated above
+    PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order
 
-static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
-
-    int64_t i;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i);
-    }
-    TC_interpol(c_bucket, res_avx);
-}
-
-//********************************matrix-vector mul routines*****************************************************
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
-
-    polyvec a[SABER_K];
-
-    uint16_t skpv1[SABER_K][SABER_N];
-
-
-
-    uint8_t seed[SABER_SEEDBYTES];
-    uint8_t noiseseed[SABER_COINBYTES];
-    int32_t i, j, k;
-
-
-//--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-//--------------AVX declaration ends------------------
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(noiseseed, SABER_COINBYTES);
-
-
-    GenMatrix(a, seed); //sample matrix A
-
-    GenSecret(skpv1, noiseseed);
-
-
-// Load sk into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
-        }
-
-    }
-
-    // Load a into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
+    // rounding
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1; // h1 = 2^(EQ-EP-1) is half of the step dropped below, so the shift rounds instead of truncating
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP; // drop the EQ-EP low bits
+            res[i].coeffs[j] &= SABER_Q - 1; // NOTE(review): mask is SABER_Q - 1 (as in the removed AVX code) although res is packed by POLVECp2BS; presumably the packer keeps only the low SABER_EP bits -- confirm
         }
     }
 
-
-
-    //------------------------do the matrix vector multiplication and rounding------------
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
-
-    // Now truncation
-
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-        }
-    }
-
-    //------------------Pack sk into byte string-------
-
-    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
-
-    //------------------Pack pk into byte string-------
-
-    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
-
-
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
-    }
-
+    PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(pk, res); // pack public key
 }
 
 
 void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly res[SABER_L];
+    toom4_points skpv1_eval[SABER_L]; // toom4_eval output for each secret polynomial; used by both multiplications below
 
-    uint32_t i, j, k;
-    polyvec a[SABER_K];     // skpv;
-    uint8_t seed[SABER_SEEDBYTES];
-    uint16_t pkcl[SABER_K][SABER_N];    //public key of received by the client
+    poly *temp = A[0]; // re-use stack space
+    poly *vprime = &A[0][0]; // same storage as temp[0]
+    poly *message = &A[0][1]; // same storage as temp[1]
 
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // seed for A sits after the packed vector in pk
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // the compressed polynomial is written after the packed vector in ciphertext
 
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t temp[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-
-    uint8_t msk_c[SABER_SCALEBYTES_KEM];
-
-    //--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod, mod_p;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i vprime_avx[SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i pkcl_avx[SABER_K][SABER_N / 16];
-
-    __m256i message_avx[SABER_N / 16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-    mod_p = _mm256_set1_epi16(SABER_P - 1);
-
-
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-    //--------------AVX declaration ends------------------
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    PQCLEAN_LIGHTSABER_AVX2_GenSecret(temp, noiseseed);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]);
     }
 
-    GenMatrix(a, seed);
-    GenSecret(skpv1, noiseseed);
+    PQCLEAN_LIGHTSABER_AVX2_GenMatrix(A, seed_A); // overwrites temp (row A[0]); from here on the secret exists only as skpv1_eval
+    PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed
 
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+    // rounding
+    for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1; // h1 = 2^(EQ-EP-1) is half of the step dropped below, so the shift rounds instead of truncating
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1;
         }
     }
-
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
-        }
-    }
-    //-----------------matrix-vector multiplication and rounding
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
-
-    // Now truncation
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-
-        }
-    }
-
-
-    //-----this result should be put in b_prime for later use in server.
-    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-
-    PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
-
-//**************client matrix-vector multiplication ends******************//
-
-    //------now calculate the v'
-
-    //-------unpack the public_key
-    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
-        }
-    }
-
-    // InnerProduct
-    //for(k=0;k<SABER_N/16;k++){
-    //  vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]);
-    //}
+    PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(ciphertext, res);
 
     // vector-vector scalar multiplication with mod p
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(temp, pk); // A is not used again, so row A[0] is reused for the unpacked public-key vector
+    PQCLEAN_LIGHTSABER_AVX2_InnerProd(vprime, temp, skpv1_eval); // NOTE(review): vprime aliases temp[0], an input of this call; presumably InnerProd finishes reading temp before storing its result -- confirm
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(message, m); // overwrites temp[1], which the InnerProd call above has already consumed
 
-    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
-
-    // Computation of v'+h1
-    for (i = 0; i < SABER_N / 16; i++) { //adding h1
-        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
-    }
-
-    // unpack m;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((m[j] >> i) & 0x01);
-        }
-    }
-    // message encoding
-    for (i = 0; i < SABER_N / 16; i++) {
-        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
-        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
-    }
-
-    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
-    for (k = 0; k < SABER_N / 16; k++) {
-        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
-        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
-        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
-    }
-
-    // Unpack avx
-    for (j = 0; j < SABER_N / 16; j++) {
-        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
-    }
-
-    PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(msk_c, temp[0]);
-
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    for (i = 0; i < SABER_N; i++) {
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); // add h1 and subtract the message coefficient moved to bit position EP-1
+        vprime->coeffs[i] &= SABER_P - 1; // keep the low SABER_EP bits (reduce mod p)
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET; // keep the top SABER_ET of those bits
     }
 
+    PQCLEAN_LIGHTSABER_AVX2_POLT2BS(msk_c, vprime);
 }
 
 
 void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
 
-    uint32_t i, j;
-    uint16_t sksv[SABER_K][SABER_N]; //secret key of the server
-    uint16_t pksv[SABER_K][SABER_N];
-    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // one element containes on decrypted bit;
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t op[SABER_N];
+    poly temp[SABER_L]; // holds the unpacked secret first, then the unpacked ciphertext vector, then v and cm
+    toom4_points sksv_eval[SABER_L]; // toom4_eval output for each secret polynomial
 
-    //--------------AVX declaration------------------
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // compressed polynomial stored after the packed vector in ciphertext
+    poly *v = &temp[0]; // inner-product result; shares storage with temp[0]
+    poly *cm = &temp[1]; // shares storage with temp[1]
 
-
-    //__m256i mod_p;
-
-    __m256i v_avx[SABER_N / 16];
-
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i sksv_avx[SABER_K][SABER_N / 16];
-    __m256i pksv_avx[SABER_K][SABER_N / 16];
-
-    //mod_p=_mm256_set1_epi16(SABER_P-1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-    //--------------AVX declaration ends------------------
-
-    //-------unpack the public_key
-
-    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
-    PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
-            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
-        }
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(temp, sk); // unpack the secret vector
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]);
     }
 
-    for (i = 0; i < SABER_N / 16; i++) {
-        v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]);
-    }
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(temp, ciphertext); // temp is reused for the ciphertext vector now that the secret lives in sksv_eval
+    PQCLEAN_LIGHTSABER_AVX2_InnerProd(v, temp, sksv_eval); // NOTE(review): v aliases temp[0], an input of this call; presumably InnerProd finishes reading temp before storing its result -- confirm
 
+    PQCLEAN_LIGHTSABER_AVX2_BS2POLT(cm, packed_cm); // overwrites temp[1], which the InnerProd call above has already consumed
 
-    // InnerProduct(b', s, mod p)
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sksv_avx[j], b_bucket[j]);
-    }
-
-    vector_vector_mul(v_avx, pksv_avx, b_bucket);
-
-    for (i = 0; i < SABER_N / 16; i++) {
-        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
-    }
-
-
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
-    }
-
-    PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(op, scale_ar);
-
-
-    //addition of h2
     for (i = 0; i < SABER_N; i++) {
-        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); // add h2 and subtract cm scaled back up by EP-ET bits
+        v->coeffs[i] &= SABER_P - 1; // keep the low SABER_EP bits (reduce mod p)
+        v->coeffs[i] >>= SABER_EP - 1; // only the top bit remains: one decoded message bit per coefficient
     }
 
-
-    POL2MSG(m, message_dec_unpacked);
+    PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(m, v);
 }
diff --git a/crypto_kem/lightsaber/avx2/SABER_params.h b/crypto_kem/lightsaber/avx2/SABER_params.h
index 11d34fda..8da6ec34 100644
--- a/crypto_kem/lightsaber/avx2/SABER_params.h
+++ b/crypto_kem/lightsaber/avx2/SABER_params.h
@@ -1,46 +1,41 @@
 #ifndef PARAMS_H
 #define PARAMS_H
-#include "api.h"
 
 
-
-
-#define SABER_K 2
+/* Don't change anything below this line */
+#define SABER_L 2 /* number of polynomials per vector; matrices are SABER_L x SABER_L (replaces the removed SABER_K) */
 #define SABER_MU 10
 #define SABER_ET 3
 
+#define SABER_N 256 /* coefficients per polynomial */
+
+#define SABER_EP 10
+#define SABER_P (1 << SABER_EP) /* 2^10 = 1024, the value of the removed literal */
 
 #define SABER_EQ 13
-#define SABER_EP 10
+#define SABER_Q (1 << SABER_EQ) /* 2^13 = 8192, the value of the removed literal */
 
-#define SABER_N 256
-#define SABER_Q 8192 //2^13
-#define SABER_P 1024
+#define SABER_SEEDBYTES 32
+#define SABER_NOISESEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8) /* SABER_MU bits per coefficient; the buffer size in the cbd prototype */
 
-#define SABER_HASHBYTES       32
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8) /* 13 * 256 / 8 = 416, the value of the removed literal */
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)
 
-#define SABER_POLYBYTES              416 //13*256/8 
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8) /* 10 * 256 / 8 = 320, the per-polynomial size the removed code hard-coded */
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)
 
-#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
-
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
-
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
-
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* ciphertext length: packed vector followed by the SABER_SCALEBYTES_KEM-byte compressed polynomial */
 
 #endif
diff --git a/crypto_kem/lightsaber/avx2/cbd.c b/crypto_kem/lightsaber/avx2/cbd.c
index a43170e2..5a61236f 100644
--- a/crypto_kem/lightsaber/avx2/cbd.c
+++ b/crypto_kem/lightsaber/avx2/cbd.c
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,10 +20,7 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
     uint64_t t, d, a[4], b[4];
     int i, j;
 
@@ -34,8 +31,8 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
             d += (t >> j) & 0x0842108421UL;
         }
 
-        a[0] =  d & 0x1f;
-        b[0] = (d >>  5) & 0x1f;
+        a[0] = d & 0x1f;
+        b[0] = (d >> 5) & 0x1f;
         a[1] = (d >> 10) & 0x1f;
         b[1] = (d >> 15) & 0x1f;
         a[2] = (d >> 20) & 0x1f;
@@ -43,9 +40,9 @@ void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
         a[3] = (d >> 30) & 0x1f;
         b[3] = (d >> 35);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]);
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/lightsaber/avx2/cbd.h b/crypto_kem/lightsaber/avx2/cbd.h
index 01ba76e8..5be3a405 100644
--- a/crypto_kem/lightsaber/avx2/cbd.h
+++ b/crypto_kem/lightsaber/avx2/cbd.h
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_LIGHTSABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/avx2/kem.c b/crypto_kem/lightsaber/avx2/kem.c
index 70221f10..e60a2d51 100644
--- a/crypto_kem/lightsaber/avx2/kem.c
+++ b/crypto_kem/lightsaber/avx2/kem.c
@@ -4,14 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
-#include <immintrin.h>
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -39,7 +37,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t
     sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
     sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
 
@@ -49,7 +47,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t
 }
 
 int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
@@ -65,7 +63,7 @@ int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const u
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk);
+    PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk);
 
     fail = PQCLEAN_LIGHTSABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC);
 
diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h
index b80c335d..b28b04f6 100644
--- a/crypto_kem/lightsaber/avx2/kem.h
+++ b/crypto_kem/lightsaber/avx2/kem.h
@@ -1,35 +1,3 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
-
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk,  uint8_t *ciphertext);
-
-void PQCLEAN_LIGHTSABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]);
-
-
-int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-
-int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
-
-int PQCLEAN_LIGHTSABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);
 
 
 
-//uint64_t clock1,clock2;
-
-//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex;
-
-
-#endif
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c
index e912fd0a..a9f866ae 100644
--- a/crypto_kem/lightsaber/avx2/pack_unpack.c
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.c
@@ -1,502 +1,153 @@
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
+#include <string.h>
 
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs;
+    uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 );
+        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6);
+        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7);
+        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5);
+        in += 8;
+        out += 3;
     }
 }
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    size_t j;
+    const uint8_t *in = bytes;
+    uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 );
-        data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 );
-        data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 );
-        data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 );
-    }
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 );
+        out[0] = (in[0]) & 0x07;
+        out[1] = ((in[0]) >> 3) & 0x07;
+        out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2);
+        out[3] = ((in[1]) >> 1) & 0x07;
+        out[4] = ((in[1]) >> 4) & 0x07;
+        out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1);
+        out[6] = ((in[2] >> 2) & 0x07);
+        out[7] = ((in[2] >> 5) & 0x07);
+        in += 3;
+        out += 8;
     }
 }
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
-    for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        data[offset_data] = bytes[j] & 0x0f;
-        data[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs;
+    uint8_t *out = bytes;
+    for (j = 0; j < SABER_N / 8; j++) {
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+        out[2] = ((in[1] >> 3) & 0xff);
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+        out[5] = ((in[3] >> 1) & 0xff);
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+        out[7] = ((in[4] >> 4) & 0xff);
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+        out[10] = ((in[6] >> 2) & 0xff);
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+        out[12] = ((in[7] >> 5) & 0xff);
+        in += 8;
+        out += 13;
     }
 }
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    size_t j;
+    const uint8_t *in = bytes;
+    uint16_t *out = data->coeffs;
+    for (j = 0; j < SABER_N / 8; j++) {
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        in += 13;
+        out += 8;
+    }
+}
 
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs;
+    uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2);
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
+        out[4] = ((in[3] >> 2) & 0xff);
+        in += 4;
+        out += 5;
     }
 }
 
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
+    size_t j;
+    const uint8_t *in = bytes;
+    uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |  ((bytes[offset_byte + 1] & 0x0f) << 2)  ;
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ;
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        in += 5;
+        out += 4;
     }
-
 }
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]);
+    }
+}
 
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES);
+    }
+}
 
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]);
+    }
+}
 
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES);
+    }
+}
 
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) {
+    size_t i, j;
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01);
         }
     }
 }
 
-void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
+void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) {
+    size_t i, j;
+    memset(bytes, 0, SABER_KEYBYTES);
 
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
+    for (j = 0; j < SABER_KEYBYTES; j++) {
+        for (i = 0; i < 8; i++) {
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i);
         }
     }
 }
-
-void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-    }
-}
-
-
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    //for(i=0;i<SABER_K;i++){
-    //i=0;
-    //offset_byte1=i*(SABER_N*13)/8;
-    for (j = 0; j < SABER_N / 8; j++) {
-        //offset_byte=offset_byte1+13*j;
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-    }
-    //}
-
-
-}
-
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-    /*This function packs 11 bit data stream into 8 bits of data.
-    */
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1);
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7);
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5);
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff );
-
-        }
-    }
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-
-            data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 );
-
-            data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 );
-
-            data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 );
-
-            data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 );
-
-            data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 );
-        }
-    }
-
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff );
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 );
-        }
-    }
-
-
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(bytes, data);
-    }
-}
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(data, bytes);
-    } else if (modulus == 8192) {
-        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(data, bytes);
-    }
-
-}
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.h b/crypto_kem/lightsaber/avx2/pack_unpack.h
index 9a5d41f0..2ba7822b 100644
--- a/crypto_kem/lightsaber/avx2/pack_unpack.h
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.h
@@ -1,56 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_LIGHTSABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_LIGHTSABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_LIGHTSABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_LIGHTSABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_LIGHTSABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_LIGHTSABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_LIGHTSABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/avx2/poly.c b/crypto_kem/lightsaber/avx2/poly.c
new file mode 100644
index 00000000..56227f6f
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/poly.c
@@ -0,0 +1,62 @@
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "poly.h"
+
+// For each i, combines row i of A (column i when `transpose` is non-zero) with s_eval[0..SABER_L-1] and interpolates the result into c[i].
+void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) {
+    size_t i, j;
+    toom4_points_product c_eval; // scratch shared by all terms of one output polynomial
+
+    if (transpose) { // non-zero: use A[j][i], i.e. column i of A
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[0][i], &s_eval[0], 0); // first term: accumulate = 0
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[j][i], &s_eval[j], 1); // later terms: accumulate = 1
+            }
+            PQCLEAN_LIGHTSABER_AVX2_toom4_interp(&c[i], &c_eval); // interpolate c_eval into c[i]
+        }
+    } else { // zero: use A[i][j], i.e. row i of A
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][0], &s_eval[0], 0); // first term: accumulate = 0
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &A[i][j], &s_eval[j], 1); // later terms: accumulate = 1
+            }
+            PQCLEAN_LIGHTSABER_AVX2_toom4_interp(&c[i], &c_eval); // interpolate c_eval into c[i]
+        }
+    }
+}
+// Combines b[0..SABER_L-1] with s_eval[0..SABER_L-1] term by term and interpolates the accumulated result into *c.
+void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) {
+    size_t i;
+    toom4_points_product c_eval; //Holds results for 9 Karatsuba at a time
+
+    PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[0], &s_eval[0], 0); // first term: accumulate = 0
+    for (i = 1; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(&c_eval, &b[i], &s_eval[i], 1); // remaining terms: accumulate = 1
+    }
+
+    PQCLEAN_LIGHTSABER_AVX2_toom4_interp(c, &c_eval); // interpolate c_eval into *c
+}
+// Expands `seed` with shake128 into SABER_L * SABER_POLYVECBYTES bytes and unpacks one SABER_POLYVECBYTES slice into each row A[i].
+void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
+    size_t i;
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES]; // one SABER_POLYVECBYTES slice per matrix row
+
+    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES); // fill the whole buffer from the SABER_SEEDBYTES-byte seed
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AVX2_BS2POLVECq(A[i], buf + i * SABER_POLYVECBYTES); // slice i -> row A[i]
+    }
+}
+// Expands `seed` with shake128 into SABER_L * SABER_POLYCOINBYTES bytes and passes one SABER_POLYCOINBYTES slice per polynomial to cbd.
+void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
+    size_t i;
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // one SABER_POLYCOINBYTES slice per secret polynomial
+
+    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); // fill the whole buffer from the SABER_NOISESEEDBYTES-byte seed
+
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_AVX2_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); // slice i -> coefficients of s[i]
+    }
+}
diff --git a/crypto_kem/lightsaber/avx2/poly.h b/crypto_kem/lightsaber/avx2/poly.h
index 8f2a7574..2e7b2a11 100644
--- a/crypto_kem/lightsaber/avx2/poly.h
+++ b/crypto_kem/lightsaber/avx2/poly.h
@@ -1,27 +1,38 @@
 #ifndef POLY_H
 #define POLY_H
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
 #include "SABER_params.h"
+#include <immintrin.h>
 #include <stdint.h>
 
-typedef struct {
+typedef union { // one polynomial: SABER_N 16-bit coefficients
     uint16_t coeffs[SABER_N];
+    __m256i dummy; // NOTE(review): presumably only present so the union gets __m256i alignment - confirm
 } poly;
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+typedef union { // 4 * SABER_N 16-bit words; operand type of the s_eval / b_eval parameters declared below
+    uint16_t coeffs[4 * SABER_N];
+    __m256i dummy; // NOTE(review): presumably only present so the union gets __m256i alignment - confirm
+} toom4_points;
 
-void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+typedef union { // 8 * SABER_N 16-bit words; type of the c_eval accumulator passed to toom4_mul_A_by_B_eval / toom4_interp
+    uint16_t coeffs[8 * SABER_N];
+    __m256i dummy; // NOTE(review): presumably only present so the union gets __m256i alignment - confirm
+} toom4_points_product;
+
+void PQCLEAN_LIGHTSABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose);
+
+void PQCLEAN_LIGHTSABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]);
+
+void PQCLEAN_LIGHTSABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_LIGHTSABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
 
 
-void PQCLEAN_LIGHTSABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval);
+
+void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b);
+
+void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/avx2/poly_mul.c b/crypto_kem/lightsaber/avx2/poly_mul.c
new file mode 100644
index 00000000..9ae8de05
--- /dev/null
+++ b/crypto_kem/lightsaber/avx2/poly_mul.c
@@ -0,0 +1,1524 @@
+#include "SABER_params.h"
+#include "poly.h"
+
+
+#define L (SABER_N / 64) // used below as an index stride into __m256i arrays (a[0 * L + k]); NOTE(review): confirm intended meaning
+// Lane-wise a * b + c on packed 16-bit integers: low 16 bits of each product, then a wrapping 16-bit add.
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
+    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
+}
+// Accumulating schoolbook product: c[k] += sum of a[i] * b[j] over i + j == k (i, j in 0..15), per 16-bit lane; c[31] is set to zero.
+static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; // local copies of eight entries of a and of b at a time
+    __m256i temp; // partial sum for the c[k] currently being built
+
+    a0 = a[0];
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+
+    c[0] = mul_add(a0, b0, c[0]); // c[0] += a[0] * b[0]
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    temp = mul_add(a1, b0, temp);
+    c[1] = _mm256_add_epi16(temp, c[1]);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    temp = mul_add(a2, b0, temp);
+    c[2] = _mm256_add_epi16(temp, c[2]);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    temp = mul_add(a3, b0, temp);
+    c[3] = _mm256_add_epi16(temp, c[3]);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    c[4] = _mm256_add_epi16(temp, c[4]);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    temp = mul_add(a5, b0, temp);
+    c[5] = _mm256_add_epi16(temp, c[5]);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a4, b2, temp);
+    c[6] = _mm256_add_epi16(temp, c[6]);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a5, b2, temp);
+    c[7] = _mm256_add_epi16(temp, c[7]);
+
+    temp = _mm256_mullo_epi16(a0, b[8]); // from c[8] on, entries 8..15 are read straight from a[] / b[]
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a6, b2, temp);
+    c[8] = _mm256_add_epi16(temp, c[8]);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a7, b2, temp);
+    c[9] = _mm256_add_epi16(temp, c[9]);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a[8], b2, temp);
+    c[10] = _mm256_add_epi16(temp, c[10]);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    temp = mul_add(a[9], b2, temp);
+    c[11] = _mm256_add_epi16(temp, c[11]);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    temp = mul_add(a[10], b2, temp);
+    c[12] = _mm256_add_epi16(temp, c[12]);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    temp = mul_add(a[11], b2, temp);
+    c[13] = _mm256_add_epi16(temp, c[13]);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    temp = mul_add(a[12], b2, temp);
+    c[14] = _mm256_add_epi16(temp, c[14]);
+
+    temp = _mm256_mullo_epi16(a0, b[15]);
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    temp = mul_add(a[13], b2, temp);
+    c[15] = _mm256_add_epi16(temp, c[15]);
+
+    a0 = a[14]; // locals now hold a[8..15]; note the order: a0 = a[14], a1 = a[15], a2..a7 = a[13]..a[8]
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14]; // same remapping for b: b0 = b[14], b1 = b[15], b2..b7 = b[13]..b[8]
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1); // c[16]: a[1] * b[15] first; entries 1..7 are read straight from a[] / b[]
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    temp = mul_add(a1, b[1], temp);
+    c[16] = _mm256_add_epi16(temp, c[16]);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    temp = mul_add(a1, b[2], temp);
+    c[17] = _mm256_add_epi16(temp, c[17]);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    temp = mul_add(a1, b[3], temp);
+    c[18] = _mm256_add_epi16(temp, c[18]);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    temp = mul_add(a1, b[4], temp);
+    c[19] = _mm256_add_epi16(temp, c[19]);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    temp = mul_add(a1, b[5], temp);
+    c[20] = _mm256_add_epi16(temp, c[20]);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    temp = mul_add(a1, b[6], temp);
+    c[21] = _mm256_add_epi16(temp, c[21]);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    temp = mul_add(a1, b[7], temp);
+    c[22] = _mm256_add_epi16(temp, c[22]);
+
+    temp = _mm256_mullo_epi16(a7, b1);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    temp = mul_add(a1, b7, temp);
+    c[23] = _mm256_add_epi16(temp, c[23]);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    temp = mul_add(a1, b6, temp);
+    c[24] = _mm256_add_epi16(temp, c[24]);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    temp = mul_add(a1, b5, temp);
+    c[25] = _mm256_add_epi16(temp, c[25]);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    temp = mul_add(a1, b4, temp);
+    c[26] = _mm256_add_epi16(temp, c[26]);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    temp = mul_add(a1, b3, temp);
+    c[27] = _mm256_add_epi16(temp, c[27]);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    temp = mul_add(a1, b2, temp);
+    c[28] = _mm256_add_epi16(temp, c[28]);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    temp = mul_add(a1, b0, temp);
+    c[29] = _mm256_add_epi16(temp, c[29]);
+
+    c[30] = mul_add(a1, b1, c[30]); // c[30] += a[15] * b[15]
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // overwritten with zero, not accumulated: no pair has i + j == 31
+}
+
+// Schoolbook product: c[k] = sum of a[i] * b[j] over i + j == k (i, j in 0..15), per 16-bit lane; c[31] = 0. Overwrites c[0..31].
+static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7; // local copies of eight entries of a and of b at a time
+    __m256i temp; // partial sum for the c[k] currently being built
+
+    a0 = a[0];
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+
+    c[0] = _mm256_mullo_epi16(a0, b0); // c[0] = a[0] * b[0]
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[1] = mul_add(a1, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    c[2] = mul_add(a2, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    c[3] = mul_add(a3, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    c[4] = mul_add(a2, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    c[5] = mul_add(a5, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    c[6] = mul_add(a4, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    c[7] = mul_add(a5, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[8]); // from c[8] on, entries 8..15 are read straight from a[] / b[]
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    c[8] = mul_add(a6, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    c[9] = mul_add(a7, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    c[10] = mul_add(a[8], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    c[11] = mul_add(a[9], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    c[12] = mul_add(a[10], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    c[13] = mul_add(a[11], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    c[14] = mul_add(a[12], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[15]);
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    c[15] = mul_add(a[13], b2, temp);
+
+    // unrolled second triangle
+    a0 = a[14]; // locals now hold a[8..15]; note the order: a0 = a[14], a1 = a[15], a2..a7 = a[13]..a[8]
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14]; // same remapping for b: b0 = b[14], b1 = b[15], b2..b7 = b[13]..b[8]
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1); // c[16]: a[1] * b[15] first; entries 1..7 are read straight from a[] / b[]
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    c[16] = mul_add(a1, b[1], temp);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    c[17] = mul_add(a1, b[2], temp);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    c[18] = mul_add(a1, b[3], temp);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    c[19] = mul_add(a1, b[4], temp);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    c[20] = mul_add(a1, b[5], temp);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    c[21] = mul_add(a1, b[6], temp);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    c[22] = mul_add(a1, b[7], temp);
+
+    temp = _mm256_mullo_epi16(a7, b1);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    c[23] = mul_add(a1, b7, temp);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    c[24] = mul_add(a1, b6, temp);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    c[25] = mul_add(a1, b5, temp);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    c[26] = mul_add(a1, b4, temp);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    c[27] = mul_add(a1, b3, temp);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    c[28] = mul_add(a1, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[29] = mul_add(a1, b0, temp);
+
+    c[30] = _mm256_mullo_epi16(a1, b1); // c[30] = a[15] * b[15]
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // no pair has i + j == 31, so the last slot is zero
+}
+// In-place 16 x 16 transpose of 16-bit elements: afterwards lane r of M[k] holds what lane k of M[r] held on entry.
+static void transpose(__m256i *M) {
+    __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+    __m256i temp, temp0, temp1, temp2;
+
+    r0 = _mm256_unpacklo_epi16(M[0], M[1]); // interleave 16-bit elements of adjacent rows (elements 0..3 of each 128-bit half)
+    r1 = _mm256_unpacklo_epi16(M[2], M[3]);
+    r2 = _mm256_unpacklo_epi16(M[4], M[5]);
+    r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+    r4 = _mm256_unpacklo_epi16(M[8], M[9]);
+    r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+    r6 = _mm256_unpacklo_epi16(M[12], M[13]);
+    r7 = _mm256_unpacklo_epi16(M[14], M[15]);
+
+    temp = _mm256_unpacklo_epi32(r0, r1); // group four rows: source columns 0 and 1 (8 and 9 in the upper half)
+    temp0 = _mm256_unpacklo_epi32(r2, r3);
+    temp1 = _mm256_unpacklo_epi32(r4, r5);
+    temp2 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpackhi_epi32(r0, r1); // same grouping for source columns 2 and 3 (10 and 11 in the upper half)
+    r9 = _mm256_unpackhi_epi32(r2, r3);
+    r10 = _mm256_unpackhi_epi32(r4, r5);
+    r11 = _mm256_unpackhi_epi32(r6, r7);
+
+    r0 = _mm256_unpacklo_epi64(temp, temp0);
+    r2 = _mm256_unpackhi_epi64(temp, temp0);
+    r1 = _mm256_unpacklo_epi64(temp1, temp2);
+    r3 = _mm256_unpackhi_epi64(temp1, temp2);
+
+    temp = _mm256_unpackhi_epi16(M[0], M[1]); // elements 4..7 of each half; M[0..7] must be read here, before M[0] is overwritten below
+    temp0 = _mm256_unpackhi_epi16(M[2], M[3]);
+    temp1 = _mm256_unpackhi_epi16(M[4], M[5]);
+    temp2 = _mm256_unpackhi_epi16(M[6], M[7]);
+
+    r4 = _mm256_unpackhi_epi16(M[8], M[9]); // read M[8], M[9] before they are overwritten on the following lines
+    M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); // 0x20: low 128-bit halves of both sources
+    M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); // 0x31: high 128-bit halves of both sources
+    M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    r5 = _mm256_unpackhi_epi16(M[10], M[11]); // M[10..15] are still unmodified at this point
+    r6 = _mm256_unpackhi_epi16(M[12], M[13]);
+    r7 = _mm256_unpackhi_epi16(M[14], M[15]);
+
+    r0 = _mm256_unpacklo_epi64(r8, r9);
+    r1 = _mm256_unpacklo_epi64(r10, r11);
+    r2 = _mm256_unpackhi_epi64(r8, r9);
+    r3 = _mm256_unpackhi_epi64(r10, r11);
+
+    M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+    M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+    r0 = _mm256_unpacklo_epi32(temp, temp0); // source columns 4 and 5 (12 and 13 in the upper half)
+    r1 = _mm256_unpacklo_epi32(temp1, temp2);
+    r2 = _mm256_unpacklo_epi32(r4, r5);
+    r3 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpacklo_epi64(r0, r1);
+    r10 = _mm256_unpackhi_epi64(r0, r1);
+    r9 = _mm256_unpacklo_epi64(r2, r3);
+    r11 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+    M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+    M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+    M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+
+    r0 = _mm256_unpackhi_epi32(temp, temp0); // source columns 6 and 7 (14 and 15 in the upper half)
+    r1 = _mm256_unpackhi_epi32(temp1, temp2);
+    r2 = _mm256_unpackhi_epi32(r4, r5);
+    r3 = _mm256_unpackhi_epi32(r6, r7);
+
+    r4 = _mm256_unpacklo_epi64(r0, r1);
+    r6 = _mm256_unpackhi_epi64(r0, r1);
+    r5 = _mm256_unpacklo_epi64(r2, r3);
+    r7 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+    M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+    M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+    M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) { // Expands seven 4-vector operands from a into 9 Karatsuba terms each, transposes them, and multiplies them against b_eval into c_eval.
+    toom4_points a_eval;// Holds the Karatsuba evaluation of a only (7 operands x 9 terms = 63 vectors); b arrives already evaluated in b_eval.
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+    __m256i *va = (__m256i *)a_eval.coeffs;
+    __m256i *vb = (__m256i *)b_eval->coeffs; // Cast drops const; vb is only passed on to the schoolbook routines (defined outside this excerpt).
+    __m256i *vc = (__m256i *)c_eval->coeffs;
+
+    //------------------AVX evaluation for 1st poly-----------------------
+    r0_avx = a[0 * L + 0]; // Operand k occupies a[k * L + 0 .. k * L + 3]; L is defined outside this excerpt.
+    r1_avx = a[0 * L + 1];
+    r2_avx = a[0 * L + 2];
+    r3_avx = a[0 * L + 3];
+
+    va[0] = r0_avx; // Terms 0..3: the four limbs themselves.
+    va[1] = r1_avx;
+    va[2] = r2_avx;
+    va[3] = r3_avx;
+    va[4] = _mm256_add_epi16(r0_avx, r1_avx); // Term 4: r0 + r1.
+    va[5] = _mm256_add_epi16(r2_avx, r3_avx); // Term 5: r2 + r3.
+    va[6] = _mm256_add_epi16(r0_avx, r2_avx); // Term 6: r0 + r2.
+    va[7] = _mm256_add_epi16(r1_avx, r3_avx); // Term 7: r1 + r3.
+    va[8] = _mm256_add_epi16(va[6], va[7]); // Term 8: r0 + r1 + r2 + r3.
+    //------------------AVX evaluation for 1st poly ends------------------
+
+    //------------------AVX evaluation for 2nd poly-----------------------
+    r0_avx = a[1 * L + 0];
+    r1_avx = a[1 * L + 1];
+    r2_avx = a[1 * L + 2];
+    r3_avx = a[1 * L + 3];
+
+    va[0 + 9] = r0_avx; // Operand k stores its nine terms at va[9 * k .. 9 * k + 8].
+    va[1 + 9] = r1_avx;
+    va[2 + 9] = r2_avx;
+    va[3 + 9] = r3_avx;
+    va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]);
+    //------------------AVX evaluation for 2nd poly ends------------------
+
+    //------------------AVX evaluation for 3rd poly-----------------------
+    r0_avx = a[2 * L + 0];
+    r1_avx = a[2 * L + 1];
+    r2_avx = a[2 * L + 2];
+    r3_avx = a[2 * L + 3];
+
+    va[0 + 18] = r0_avx;
+    va[1 + 18] = r1_avx;
+    va[2 + 18] = r2_avx;
+    va[3 + 18] = r3_avx;
+    va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]);
+    //------------------AVX evaluation for 3rd poly ends------------------
+
+    //------------------AVX evaluation for 4th poly-----------------------
+    r0_avx = a[3 * L + 0];
+    r1_avx = a[3 * L + 1];
+    r2_avx = a[3 * L + 2];
+    r3_avx = a[3 * L + 3];
+
+    va[0 + 27] = r0_avx;
+    va[1 + 27] = r1_avx;
+    va[2 + 27] = r2_avx;
+    va[3 + 27] = r3_avx;
+    va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]);
+    //------------------AVX evaluation for 4th poly ends------------------
+
+    //------------------AVX evaluation for 5th poly-----------------------
+    r0_avx = a[4 * L + 0];
+    r1_avx = a[4 * L + 1];
+    r2_avx = a[4 * L + 2];
+    r3_avx = a[4 * L + 3];
+
+    va[0 + 36] = r0_avx;
+    va[1 + 36] = r1_avx;
+    va[2 + 36] = r2_avx;
+    va[3 + 36] = r3_avx;
+    va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]);
+    //------------------AVX evaluation for 5th poly ends------------------
+
+    //------------------AVX evaluation for 6th poly-----------------------
+    r0_avx = a[5 * L + 0];
+    r1_avx = a[5 * L + 1];
+    r2_avx = a[5 * L + 2];
+    r3_avx = a[5 * L + 3];
+
+    va[0 + 45] = r0_avx;
+    va[1 + 45] = r1_avx;
+    va[2 + 45] = r2_avx;
+    va[3 + 45] = r3_avx;
+    va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]);
+    //------------------AVX evaluation for 6th poly ends------------------
+
+    //------------------AVX evaluation for 7th poly-----------------------
+    r0_avx = a[6 * L + 0];
+    r1_avx = a[6 * L + 1];
+    r2_avx = a[6 * L + 2];
+    r3_avx = a[6 * L + 3];
+
+    va[0 + 54] = r0_avx;
+    va[1 + 54] = r1_avx;
+    va[2 + 54] = r2_avx;
+    va[3 + 54] = r3_avx;
+    va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]); // Last vector written is va[62].
+    //------------------AVX evaluation for 7th poly ends------------------
+
+    //-----------------Forward transposes--------------------------------------
+    transpose(va); // After each 16x16 transpose, lane j of a vector belongs to term 16 * block + j.
+    transpose(va + 16);
+    transpose(va + 32);
+    transpose(va + 48); // NOTE(review): va[63] is never assigned above, so this reads one uninitialized vector of the local a_eval (assumes coeffs spans at least 64 vectors - confirm). It only lands in lane 15 of this block.
+    //-----------------Forward transposes ends---------------------------------
+
+    if (accumulate == 0) { // Four batches of 16 vectors; each batch's output starts 32 vectors after the previous one in vc.
+        schoolbook_avx(vc, va, vb);
+        schoolbook_avx(vc + 32, va + 16, vb + 16);
+        schoolbook_avx(vc + 64, va + 32, vb + 32);
+        schoolbook_avx(vc + 96, va + 48, vb + 48);
+    } else { // Presumably schoolbook_avx_acc adds into vc instead of overwriting it; it is defined outside this excerpt - confirm.
+        schoolbook_avx_acc(vc, va, vb);
+        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
+        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
+        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+    }
+}
+
+static void karatsuba_eval(__m256i *b_eval, const __m256i *b) { // Expands seven 4-vector operands from b into 9 Karatsuba terms each (b_eval[0..62]) and transposes b_eval[0..63] in 16x16 blocks.
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+
+    //-------1st poly----------------------------------------------------
+    r0_avx = b[0 * L + 0]; // Operand k occupies b[k * L + 0 .. k * L + 3]; L is defined outside this excerpt.
+    r1_avx = b[0 * L + 1];
+    r2_avx = b[0 * L + 2];
+    r3_avx = b[0 * L + 3];
+
+    b_eval[0] = r0_avx; // Terms 0..3: the four limbs themselves.
+    b_eval[1] = r1_avx;
+    b_eval[2] = r2_avx;
+    b_eval[3] = r3_avx;
+    b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx); // Term 4: r0 + r1.
+    b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx); // Term 5: r2 + r3.
+    b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx); // Term 6: r0 + r2.
+    b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx); // Term 7: r1 + r3.
+    b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]); // Term 8: r0 + r1 + r2 + r3.
+
+    //-------2nd poly----------------------------------------------------
+    r0_avx = b[1 * L + 0];
+    r1_avx = b[1 * L + 1];
+    r2_avx = b[1 * L + 2];
+    r3_avx = b[1 * L + 3];
+
+    b_eval[0 + 9] = r0_avx; // Operand k stores its nine terms at b_eval[9 * k .. 9 * k + 8].
+    b_eval[1 + 9] = r1_avx;
+    b_eval[2 + 9] = r2_avx;
+    b_eval[3 + 9] = r3_avx;
+    b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]);
+
+    //-------3rd poly----------------------------------------------------
+    r0_avx = b[2 * L + 0];
+    r1_avx = b[2 * L + 1];
+    r2_avx = b[2 * L + 2];
+    r3_avx = b[2 * L + 3];
+
+    b_eval[0 + 18] = r0_avx;
+    b_eval[1 + 18] = r1_avx;
+    b_eval[2 + 18] = r2_avx;
+    b_eval[3 + 18] = r3_avx;
+    b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]);
+
+    //-------4th poly----------------------------------------------------
+    r0_avx = b[3 * L + 0];
+    r1_avx = b[3 * L + 1];
+    r2_avx = b[3 * L + 2];
+    r3_avx = b[3 * L + 3];
+
+    b_eval[0 + 27] = r0_avx;
+    b_eval[1 + 27] = r1_avx;
+    b_eval[2 + 27] = r2_avx;
+    b_eval[3 + 27] = r3_avx;
+    b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]);
+
+    //-------5th poly----------------------------------------------------
+    r0_avx = b[4 * L + 0];
+    r1_avx = b[4 * L + 1];
+    r2_avx = b[4 * L + 2];
+    r3_avx = b[4 * L + 3];
+
+    b_eval[0 + 36] = r0_avx;
+    b_eval[1 + 36] = r1_avx;
+    b_eval[2 + 36] = r2_avx;
+    b_eval[3 + 36] = r3_avx;
+    b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]);
+
+    //-------6th poly----------------------------------------------------
+    r0_avx = b[5 * L + 0];
+    r1_avx = b[5 * L + 1];
+    r2_avx = b[5 * L + 2];
+    r3_avx = b[5 * L + 3];
+
+    b_eval[0 + 45] = r0_avx;
+    b_eval[1 + 45] = r1_avx;
+    b_eval[2 + 45] = r2_avx;
+    b_eval[3 + 45] = r3_avx;
+    b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]);
+
+    //-------7th poly----------------------------------------------------
+    r0_avx = b[6 * L + 0];
+    r1_avx = b[6 * L + 1];
+    r2_avx = b[6 * L + 2];
+    r3_avx = b[6 * L + 3];
+
+    b_eval[0 + 54] = r0_avx;
+    b_eval[1 + 54] = r1_avx;
+    b_eval[2 + 54] = r2_avx;
+    b_eval[3 + 54] = r3_avx;
+    b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]); // Last vector written is b_eval[62].
+
+    //--------------Evaluating B poly ends-------------------------------
+    transpose(b_eval); // After each 16x16 transpose, lane j of a vector belongs to term 16 * block + j.
+    transpose(b_eval + 16);
+    transpose(b_eval + 32);
+    transpose(b_eval + 48); // NOTE(review): b_eval[63] is not written by this function, so its incoming contents are transposed as-is; confirm the caller's buffer spans 64 vectors.
+}
+
+static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) { // Recombines, for each of seven operands, nine 2-vector partial products read from c_eval into eight result vectors result_finalK[0..7].
+    __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
+    __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; // Names follow the 1st operand's slots: cN_avx is loaded from / derived from c_eval[N] there.
+
+    //------------------------AVX interpolation for 1st poly external-------------------
+    res_avx0 = c_eval[0]; // The index pattern in this function places partial product p at c_eval[32 * (p / 16) + p % 16] (first vector) and 16 slots later (second vector).
+    res_avx2 = c_eval[1]; // Operand k uses partial products 9 * k .. 9 * k + 8; below, Pj denotes the j-th of those nine.
+    res_avx4 = c_eval[2]; // P2, first vector.
+    res_avx6 = c_eval[3]; // P3, first vector.
+    c6_avx = c_eval[6]; // P6, first vector.
+    c7_avx = c_eval[7]; // P7, first vector.
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx); // P8 - P6 - P7 (first vectors).
+
+    res_avx1 = c_eval[16]; // P0, second vector.
+    res_avx3 = c_eval[17]; // P1, second vector.
+    res_avx5 = c_eval[18]; // P2, second vector.
+    res_avx7 = c_eval[19]; // P3, second vector.
+    c22_avx = c_eval[22]; // P6, second vector.
+    c23_avx = c_eval[23]; // P7, second vector.
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7); // P5 - P2 - P3 (second vectors).
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx); // P8 - P6 - P7 (second vectors).
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3); // P4 - P0 - P1 (second vectors).
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6); // P5 - P2 - P3 (first vectors).
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2); // P4 - P0 - P1 (first vectors).
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4); // These four subtractions use the already-updated res_avx1, 2, 5 and 6, so statement order matters.
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final0[0] = res_avx0; // Eight output vectors per operand; the middle four receive the cross terms.
+    result_final0[1] = res_avx1;
+    result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final0[6] = res_avx6;
+    result_final0[7] = res_avx7;
+    //------------------------AVX interpolation for 1st poly ends--------------
+
+
+    //------------------------AVX interpolation for 2nd poly external-------------------
+    res_avx0 = c_eval[9]; //c_eval0
+    res_avx2 = c_eval[10]; //c_eval1
+    res_avx4 = c_eval[11]; //c_eval2
+    res_avx6 = c_eval[12]; //c_eval3
+    c6_avx = c_eval[15]; //c_eval6
+    c7_avx = c_eval[32]; //c_eval7
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx); // Partial products 16 and 17 fall in the next 32-slot group, hence the jump from index 15 to 32/33.
+
+    res_avx1 = c_eval[25]; //c_eval0
+    res_avx3 = c_eval[26]; //c_eval1
+    res_avx5 = c_eval[27]; //c_eval2
+    res_avx7 = c_eval[28]; //c_eval3
+    c22_avx = c_eval[31];
+    c23_avx = c_eval[48];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final1[0] = res_avx0;
+    result_final1[1] = res_avx1;
+    result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final1[6] = res_avx6;
+    result_final1[7] = res_avx7;
+    //------------------------AVX interpolation for 2nd poly ends--------------
+
+    //------------------------AVX interpolation for 3rd poly external-------------------
+    res_avx0 = c_eval[34]; //c_eval0
+    res_avx2 = c_eval[35]; //c_eval1
+    res_avx4 = c_eval[36];
+    res_avx6 = c_eval[37];
+    c6_avx = c_eval[40];
+    c7_avx = c_eval[41];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx); // Partial products 18..26 all sit in one group: first vectors at 34..42, second vectors at 50..58.
+
+    res_avx1 = c_eval[50]; //c_eval0
+    res_avx3 = c_eval[51]; //c_eval1
+    res_avx5 = c_eval[52];
+    res_avx7 = c_eval[53];
+    c22_avx = c_eval[56];
+    c23_avx = c_eval[57];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final2[0] = res_avx0;
+    result_final2[1] = res_avx1;
+    result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final2[6] = res_avx6;
+    result_final2[7] = res_avx7;
+    //------------------------AVX interpolation for 3rd poly ends--------------
+
+    //------------------------AVX interpolation for 4th poly external-------------------
+    res_avx0 = c_eval[43];
+    res_avx2 = c_eval[44];
+    res_avx4 = c_eval[45];
+    res_avx6 = c_eval[46];
+    c6_avx = c_eval[65];
+    c7_avx = c_eval[66];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx); // Partial products 27..31 sit at 43..47 / 59..63; 32..35 continue in the next group at 64..67 / 80..83.
+
+    res_avx1 = c_eval[59];
+    res_avx3 = c_eval[60];
+    res_avx5 = c_eval[61];
+    res_avx7 = c_eval[62];
+    c22_avx = c_eval[81];
+    c23_avx = c_eval[82];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final3[0] = res_avx0;
+    result_final3[1] = res_avx1;
+    result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final3[6] = res_avx6;
+    result_final3[7] = res_avx7;
+    //------------------------AVX interpolation for 4th poly ends--------------
+
+    //------------------------AVX interpolation for 5th poly external-------------------
+    res_avx0 = c_eval[68];
+    res_avx2 = c_eval[69];
+    res_avx4 = c_eval[70];
+    res_avx6 = c_eval[71];
+    c6_avx = c_eval[74];
+    c7_avx = c_eval[75];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx); // Partial products 36..44 all sit in one group: first vectors at 68..76, second vectors at 84..92.
+
+    res_avx1 = c_eval[84];
+    res_avx3 = c_eval[85];
+    res_avx5 = c_eval[86];
+    res_avx7 = c_eval[87];
+    c22_avx = c_eval[90];
+    c23_avx = c_eval[91];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final4[0] = res_avx0;
+    result_final4[1] = res_avx1;
+    result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final4[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final4[6] = res_avx6;
+    result_final4[7] = res_avx7;
+    //------------------------AVX interpolation for 5th poly ends--------------
+
+    //------------------------AVX interpolation for 6th poly external-------------------
+    res_avx0 = c_eval[77];
+    res_avx2 = c_eval[78];
+    res_avx4 = c_eval[79];
+    res_avx6 = c_eval[96];
+    c6_avx = c_eval[99];
+    c7_avx = c_eval[100];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx); // Partial products 45..47 sit at 77..79 / 93..95; 48..53 continue in the last group at 96..101 / 112..117.
+
+    res_avx1 = c_eval[93];
+    res_avx3 = c_eval[94];
+    res_avx5 = c_eval[95];
+    res_avx7 = c_eval[112];
+    c22_avx = c_eval[115];
+    c23_avx = c_eval[116];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final5[0] = res_avx0;
+    result_final5[1] = res_avx1;
+    result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final5[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final5[6] = res_avx6;
+    result_final5[7] = res_avx7;
+    //------------------------AVX interpolation for 6th poly ends--------------
+
+    //------------------------AVX interpolation for 7th poly external-------------------
+    res_avx0 = c_eval[102];
+    res_avx2 = c_eval[103];
+    res_avx4 = c_eval[104];
+    res_avx6 = c_eval[105];
+    c6_avx = c_eval[108];
+    c7_avx = c_eval[109];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx); // Partial products 54..62: first vectors at 102..110, second vectors at 118..126; slots 111 and 127 are never read.
+
+    res_avx1 = c_eval[118];
+    res_avx3 = c_eval[119];
+    res_avx5 = c_eval[120];
+    res_avx7 = c_eval[121];
+    c22_avx = c_eval[124];
+    c23_avx = c_eval[125];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final6[0] = res_avx0;
+    result_final6[1] = res_avx1;
+    result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final6[6] = res_avx6;
+    result_final6[7] = res_avx7;
+    //------------------------AVX interpolation for 7th poly ends--------------
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) { // Evaluates a at seven Toom-4 points, then multiplies against the pre-evaluated b_eval into c_eval; accumulate is forwarded unchanged.
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i aw_avx[7 * L]; // Seven evaluations of L vectors each; L is defined outside this excerpt.
+    __m256i *va = (__m256i *)a->coeffs; // Cast drops const; va is only read in this function.
+
+    for (i = 0; i < L; i++) { // All arithmetic is on 16-bit lanes and wraps modulo 2^16.
+        r0_avx = va[0 * L + i]; // r0..r3: the four consecutive L-vector quarters of a; A(y) = r0 + r1*y + r2*y^2 + r3*y^3.
+        r1_avx = va[1 * L + i];
+        r2_avx = va[2 * L + i];
+        r3_avx = va[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx); // Even part: r0 + r2.
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx); // Odd part: r1 + r3.
+        aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // A(1).
+        aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // A(-1).
+        r4_avx = _mm256_slli_epi16(r0_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // r4 = 8*r0 + 2*r2.
+        r5_avx = _mm256_slli_epi16(r1_avx, 2);
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3.
+        aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8 * A(1/2).
+        aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8 * A(-1/2).
+        r4_avx = _mm256_slli_epi16(r3_avx, 3);
+        r6_avx = _mm256_slli_epi16(r2_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        r6_avx = _mm256_slli_epi16(r1_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // A(2) = r0 + 2*r1 + 4*r2 + 8*r3.
+        aw_avx[6 * L + i] = r0_avx; // A(0).
+        aw_avx[0 * L + i] = r3_avx; // Leading quarter (the "point at infinity").
+    }
+
+    batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate);
+}
+
+void PQCLEAN_LIGHTSABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) { // Evaluates b at seven Toom-4 points and stores the Karatsuba-expanded, transposed result in b_eval.
+    size_t i;
+    __m256i bw_avx[7 * L]; // Seven evaluations of L vectors each; L is defined outside this excerpt.
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i *vb = (__m256i *)b->coeffs; // Cast drops const; vb is only read in this function.
+    __m256i *vb_eval = (__m256i *)b_eval->coeffs;
+
+    for (i = 0; i < L; i++) { // All arithmetic is on 16-bit lanes and wraps modulo 2^16.
+        r0_avx = vb[0 * L + i]; // r0..r3: the four consecutive L-vector quarters of b; B(y) = r0 + r1*y + r2*y^2 + r3*y^3.
+        r1_avx = vb[1 * L + i];
+        r2_avx = vb[2 * L + i];
+        r3_avx = vb[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx); // Even part: r0 + r2.
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx); // Odd part: r1 + r3.
+        bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // B(1).
+        bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // B(-1).
+        r4_avx = _mm256_slli_epi16(r0_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // r4 = 8*r0 + 2*r2.
+        r5_avx = _mm256_slli_epi16(r1_avx, 2);
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // r5 = 4*r1 + r3.
+        bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8 * B(1/2).
+        bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8 * B(-1/2).
+        r4_avx = _mm256_slli_epi16(r3_avx, 3);
+        r6_avx = _mm256_slli_epi16(r2_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        r6_avx = _mm256_slli_epi16(r1_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // B(2) = r0 + 2*r1 + 4*r2 + 8*r3.
+        bw_avx[6 * L + i] = r0_avx; // B(0).
+        bw_avx[0 * L + i] = r3_avx; // Leading quarter (the "point at infinity").
+    }
+
+    karatsuba_eval(vb_eval, bw_avx); // Same slot order as the a-side evaluation in toom4_mul_A_by_B_eval, so the terms pair up lane for lane.
+}
+
+
+void PQCLEAN_LIGHTSABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) { // Turns the evaluated products in c_eval back into one polynomial: Karatsuba recombination, Toom-4 interpolation, then reduction by X^256 + 1 into res.
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+    __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; // One 2*L-vector product per evaluation point.
+    __m256i res_full[32]; // Highest index written below is 6 * L + 2 * L - 1, which fits exactly when L == 4 (L is defined outside this excerpt - confirm).
+    __m256i *vc = (__m256i *)c_eval->coeffs; // NOTE(review): the cast drops const and the transposes below write through vc, so the caller's c_eval is modified in place.
+    __m256i *vres = (__m256i *)res->coeffs;
+
+    transpose(vc); // Eight 16x16 blocks (128 vectors) are transposed back so that each vector again holds 16 coefficients of one partial product.
+    transpose(vc + 16);
+    transpose(vc + 32);
+    transpose(vc + 48);
+    transpose(vc + 64);
+    transpose(vc + 80);
+    transpose(vc + 96);
+    transpose(vc + 112);
+
+    karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc);
+
+    for (i = 0; i < 2 * L; i++) { // Interpolation is applied independently at every vector position of the seven products; statement order matters.
+        r0_avx = w1_avx[i];
+        r1_avx = w2_avx[i];
+        r2_avx = w3_avx[i];
+        r3_avx = w4_avx[i];
+        r4_avx = w5_avx[i];
+        r5_avx = w6_avx[i];
+        r6_avx = w7_avx[i];
+
+        r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
+        r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
+        r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
+        r3_avx = _mm256_srli_epi16(r3_avx, 1); // Logical shift: halves the 16-bit value and clears the top bit (presumably harmless for the caller's modulus - confirm).
+        r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
+        temp_avx = _mm256_slli_epi16(r6_avx, 6); // 64 * r6.
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
+        r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
+        temp_avx = _mm256_slli_epi16(r2_avx, 6); // 64 * r2.
+
+        r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
+        r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
+        r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
+        r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
+        temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45));
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+        temp_avx = _mm256_slli_epi16(r2_avx, 3); // 8 * r2.
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+        r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
+        r4_avx = _mm256_srli_epi16(r4_avx, 3); // Together with the line above: division by 24.
+        r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
+        temp_avx = _mm256_slli_epi16(r3_avx, 4); // 16 * r3.
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+        r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
+        r1_avx = _mm256_srli_epi16(r1_avx, 1); // Together with the line above: division by 18.
+        r3_avx = _mm256_add_epi16(r1_avx, r3_avx);
+        r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx); // r3 = -(r1 + r3).
+        temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30));
+        temp_avx = _mm256_sub_epi16(temp_avx, r5_avx);
+        temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
+
+        r5_avx = _mm256_srli_epi16(temp_avx, 2); // Together with the line above: division by 60.
+        r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
+        r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
+
+        if (i < L) { // First half of each coefficient block: block k starts at res_full[k * L], with r6 lowest and r0 highest.
+            res_full[0 * L + i] = r6_avx;
+            res_full[1 * L + i] = r5_avx;
+            res_full[2 * L + i] = r4_avx;
+            res_full[3 * L + i] = r3_avx;
+            res_full[4 * L + i] = r2_avx;
+            res_full[5 * L + i] = r1_avx;
+            res_full[6 * L + i] = r0_avx;
+        } else { // Second half (i >= L) lands on the first half of the next block, which was stored while i < L, so it is added.
+            res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx);
+            res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx);
+            res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx);
+            res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx);
+            res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx);
+            res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], r1_avx);
+            res_full[6 * L + i] = r0_avx; // Topmost block has nothing above it to overlap, so this is a plain store.
+        }
+    }
+
+    // Reduction by X^256 + 1
+    for (i = 0; i < 16; i++) { // 16 vectors = 256 coefficients; X^256 = -1, so the upper half is subtracted from the lower half.
+        vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]);
+    }
+}
diff --git a/crypto_kem/lightsaber/avx2/polymul/consts.h b/crypto_kem/lightsaber/avx2/polymul/consts.h
deleted file mode 100644
index 40826398..00000000
--- a/crypto_kem/lightsaber/avx2/polymul/consts.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "../SABER_params.h"
-
-#define AVX_N (SABER_N >> 4)
-#define small_len_avx (AVX_N >> 2)
-
-#define SCHB_N 16
-
-#define N_SB (SABER_N >> 2)
-#define N_SB_RES (2*N_SB-1)
-
-#define N_SB_16 (N_SB >> 2)
-#define N_SB_16_RES (2*N_SB_16-1)
-
-#define AVX_N1 16 /*N/16*/ 
-
-#define SCM_SIZE 16
-
-// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements
-#define NUM_POLY SABER_K
-//int NUM_POLY=2; 
diff --git a/crypto_kem/lightsaber/avx2/polymul/matrix.c b/crypto_kem/lightsaber/avx2/polymul/matrix.c
deleted file mode 100644
index 5fa35783..00000000
--- a/crypto_kem/lightsaber/avx2/polymul/matrix.c
+++ /dev/null
@@ -1,303 +0,0 @@
-#include <immintrin.h>
-
-static void transpose_n1(__m256i *M)
-{
-	//int i;
-	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
-	register __m256i temp, temp0, temp1, temp2;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
-		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
-		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
-		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
-		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
-		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
-		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
-		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
-
-
-		temp = _mm256_unpacklo_epi32(r0, r1); 
-		temp0 = _mm256_unpacklo_epi32(r2, r3); 
-		temp1 = _mm256_unpacklo_epi32(r4, r5); 
-		temp2 = _mm256_unpacklo_epi32(r6, r7); 
-
-		r8 = _mm256_unpackhi_epi32(r0, r1); 
-		r9 = _mm256_unpackhi_epi32(r2, r3); 
-		r10 = _mm256_unpackhi_epi32(r4, r5); 
-		r11 = _mm256_unpackhi_epi32(r6, r7);
-
-		r0 = _mm256_unpacklo_epi64(temp, temp0); 
-		r2 = _mm256_unpackhi_epi64(temp, temp0); 
-
-		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
-		r3 = _mm256_unpackhi_epi64(temp1, temp2);
-
-		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
-		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
-		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
-		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
-		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
-		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
-
-
-		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
-		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
-		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-
-
-		r0 = _mm256_unpacklo_epi64(r8, r9); 
-		r1 = _mm256_unpacklo_epi64(r10, r11); 
-
-		r2 = _mm256_unpackhi_epi64(r8, r9); 
-		r3 = _mm256_unpackhi_epi64(r10, r11); 
-
-
-
-		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
-		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
-
-
-	//for(i=0; i<4; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi32(temp, temp0); 
-		r1 = _mm256_unpacklo_epi32(temp1, temp2);
-		r2 = _mm256_unpacklo_epi32(r4, r5); 
-		r3 = _mm256_unpacklo_epi32(r6, r7); 
-
-	//}
-
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		r8 = _mm256_unpacklo_epi64(r0, r1); 
-		r10 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r9 = _mm256_unpacklo_epi64(r2, r3); 
-		r11 = _mm256_unpackhi_epi64(r2, r3); 
-
-		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
-		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
-		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
-		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
-
-		r0 = _mm256_unpackhi_epi32(temp, temp0); 
-		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
-		r2 = _mm256_unpackhi_epi32(r4, r5); 
-		r3 = _mm256_unpackhi_epi32(r6, r7); 
-
-	//}
-//	for(i=0; i<2; i=i+1)
-//	{
-		r4 = _mm256_unpacklo_epi64(r0, r1); 
-		r6 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r5 = _mm256_unpacklo_epi64(r2, r3); 
-		r7 = _mm256_unpackhi_epi64(r2, r3); 
-
-//	}
-
-	//-------------------------------------------------------
-
-	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
-	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
-	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
-	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
-}
-
-/*
-void transpose_unrolled(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
-		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
-
-		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
-		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
-
-		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
-		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
-
-		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
-		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
-
-		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
-		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
-		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
-
-		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
-		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
-
-		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
-		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-	//}
-
-	//-------------------------------------------------------
-	//for(i=0; i<4; i=i+1)
-	//{
-		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
-		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
-
-		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
-		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
-
-		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
-		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
-
-		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
-		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
-
-	//}
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
-		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
-
-		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
-		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
-
-		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	//}
-	//for(i=0; i<2; i=i+1)
-	//{
-		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
-		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
-
-		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
-		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
-
-	//}
-
-	//-------------------------------------------------------
-
-	//-------------------------------------------------------
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	//-------------------------------------------------------
-
-
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-
-
-void transpose1(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	for(i=0; i<8; i=i+1)
-	{
-		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
-		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
-	}
-
-	for(i=0; i<4; i=i+1)
-	{
-		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
-		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
-	}
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
-		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
-		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-*/
diff --git a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c b/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
deleted file mode 100644
index 48870f51..00000000
--- a/crypto_kem/lightsaber/avx2/polymul/scm_avx.c
+++ /dev/null
@@ -1,753 +0,0 @@
-//#define SCM_SIZE 16
-
-//#pragma STDC FP_CONTRACT ON
-
-#include <immintrin.h>
-
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-
-static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are added cummulatively
-{
-
-	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	register __m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-
-	//otherwise accumulate
-	c_avx[0] = mul_add(a0, b0, c_avx[0]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	temp=mul_add(a1, b0, temp);
-	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-	temp = mul_add(a1, b1, temp);
-	temp=mul_add(a2, b0, temp);
-	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	temp=mul_add(a3, b0, temp);
-	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	temp=mul_add(a2, b2, temp);
-	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	temp=mul_add(a5, b0, temp);
-	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	temp=mul_add(a4, b2, temp);
-	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	temp=mul_add(a5, b2, temp);
-	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	
-		temp=mul_add(a6, b2, temp);
-		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-		temp=mul_add(a7, b2, temp);
-		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
-
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-		temp=mul_add(a[8], b2, temp);
-		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-		temp=mul_add(a[9], b2, temp);
-		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-		temp=mul_add(a[10], b2, temp);
-		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-		temp=mul_add(a[11], b2, temp);
-		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
-
-
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-		temp=mul_add(a[12], b2, temp);
-		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-		temp=mul_add(a[13], b2, temp);
-		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-		temp=mul_add(a1, b[1], temp);
-		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
-
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-		temp=mul_add(a1, b[2], temp);
-		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
-
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-		temp=mul_add(a1, b[3], temp);
-		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
-
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-		temp=mul_add(a1, b[4], temp);
-		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
-
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-		temp=mul_add(a1, b[5], temp);
-		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
-
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-		temp=mul_add(a1, b[6], temp);
-		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
-
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-		temp=mul_add(a1, b[7], temp);
-		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
-
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-		temp=mul_add(a1, b7, temp);
-		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
-
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-		temp=mul_add(a1, b6, temp);
-		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
-
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-		temp=mul_add(a1, b5, temp);
-		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
-
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-		temp=mul_add(a1, b4, temp);
-		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
-
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-		temp=mul_add(a1, b3, temp);
-		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
-
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-		temp=mul_add(a1, b2, temp);
-		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-		temp=mul_add(a1, b0, temp);
-		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
-
-
-		c_avx[30] = mul_add(a1, b1, c_avx[30]);
-
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-
-}
-
-
-
-static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are not added cummulatively
-{
-
-	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	__m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[1]=mul_add(a1, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-
-	temp = mul_add(a1, b1, temp);
-	c_avx[2]= mul_add(a2, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	c_avx[3]= mul_add(a3, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	c_avx[4]= mul_add(a2, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	c_avx[5] = mul_add(a5, b0, temp);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	c_avx[6] = mul_add(a4, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	c_avx[7] = mul_add (a5, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	c_avx[8] = mul_add (a6, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-	c_avx[9] = mul_add (a7, b2, temp);
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-	c_avx[10] = mul_add (a[8], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-	c_avx[11] = mul_add (a[9], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-	c_avx[12] = mul_add (a[10], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-	c_avx[13] = mul_add (a[11], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-	c_avx[14] = mul_add (a[12], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-	c_avx[15] = mul_add (a[13], b2, temp );
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-	
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-	c_avx[16] = mul_add (a1, b[1], temp );
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-	c_avx[17] = mul_add (a1, b[2], temp );
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-	c_avx[18] = mul_add (a1, b[3], temp );
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-	c_avx[19] = mul_add (a1, b[4], temp );
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-	c_avx[20] = mul_add (a1, b[5], temp );
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-	c_avx[21] = mul_add (a1, b[6], temp );
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-	c_avx[22] = mul_add (a1, b[7], temp );
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-	c_avx[23] = mul_add (a1, b7, temp );
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-	c_avx[24] = mul_add (a1, b6, temp );
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-	c_avx[25] = mul_add (a1, b5, temp );
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-	c_avx[26] = mul_add (a1, b4, temp );
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-	c_avx[27] = mul_add (a1, b3, temp );
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-	c_avx[28] = mul_add (a1, b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[29] = mul_add (a1, b0, temp);
-
-	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-}
diff --git a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c b/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
deleted file mode 100644
index 78fb86c2..00000000
--- a/crypto_kem/lightsaber/avx2/polymul/toom-cook_4way.c
+++ /dev/null
@@ -1,1010 +0,0 @@
-/*
-Cleaned version for step by step approach look into the _debug file
-*/
-//#include "timing.c"
-#include "consts.h"
-#include "matrix.c"
-#include "scm_avx.c"
-
-static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX.
-{
-	__m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time
-
-	//uint16_t i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-
-		//CLOCK1=cpucycles();
-		
-		//------------------AVX evaluation for 1st poly-----------------------
-
-                    r0_avx=a[0];
-                    r1_avx=a[1];
-                    r2_avx=a[2];
-                    r3_avx=a[3];
-		    a_bucket[0]=r0_avx;
-		    a_bucket[1]=r1_avx;
-		    a_bucket[2]=r2_avx;
-		    a_bucket[3]=r3_avx;
-		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
-
-
-		//------------------AVX evaluation for 1st poly ends------------------
-
-
-		//------------------AVX evaluation for 2nd poly-----------------------
-                    r0_avx=a[small_len_avx];
-                    r1_avx=a[small_len_avx+1];
-                    r2_avx=a[small_len_avx+2];
-                    r3_avx=a[small_len_avx+3];
-		    a_bucket[0+9]=r0_avx;
-		    a_bucket[1+9]=r1_avx;
-		    a_bucket[2+9]=r2_avx;
-		    a_bucket[3+9]=r3_avx;
-		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
-
-	
-		//------------------AVX evaluation for 2nd poly ends------------------
-
-
-		//------------------AVX evaluation for 3rd poly-----------------------
-                    r0_avx=a[2*small_len_avx];
-                    r1_avx=a[2*small_len_avx+1];
-                    r2_avx=a[2*small_len_avx+2];
-                    r3_avx=a[2*small_len_avx+3];
-		    a_bucket[0+18]=r0_avx;
-		    a_bucket[1+18]=r1_avx;
-		    a_bucket[2+18]=r2_avx;
-		    a_bucket[3+18]=r3_avx;
-		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
-		
-		//------------------AVX evaluation for 3rd poly ends------------------
-
-
-		//------------------AVX evaluation for 4th poly-----------------------
-
-                    r0_avx=a[3*small_len_avx];
-                    r1_avx=a[3*small_len_avx+1];
-                    r2_avx=a[3*small_len_avx+2];
-                    r3_avx=a[3*small_len_avx+3];
-		    a_bucket[0+27]=r0_avx;
-		    a_bucket[1+27]=r1_avx;
-		    a_bucket[2+27]=r2_avx;
-		    a_bucket[3+27]=r3_avx;
-		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
-		
-		//------------------AVX evaluation for 4th poly ends------------------
-
-		//------------------AVX evaluation for 5th poly-----------------------
-		
-                    r0_avx=a[4*small_len_avx+0];
-                    r1_avx=a[4*small_len_avx+1];
-                    r2_avx=a[4*small_len_avx+2];
-                    r3_avx=a[4*small_len_avx+3];
-		    a_bucket[0+36]=r0_avx;
-		    a_bucket[1+36]=r1_avx;
-		    a_bucket[2+36]=r2_avx;
-		    a_bucket[3+36]=r3_avx;
-		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
-		
-		//------------------AVX evaluation for 5th poly ends------------------
-
-
-		//------------------AVX evaluation for 6th poly-----------------------
-                    r0_avx=a[5*small_len_avx];
-                    r1_avx=a[5*small_len_avx+1];
-                    r2_avx=a[5*small_len_avx+2];
-                    r3_avx=a[5*small_len_avx+3];
-		    a_bucket[0+45]=r0_avx;
-		    a_bucket[1+45]=r1_avx;
-		    a_bucket[2+45]=r2_avx;
-		    a_bucket[3+45]=r3_avx;
-		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
-		
-		//------------------AVX evaluation for 6th poly ends------------------
-
-		//------------------AVX evaluation for 7th poly-----------------------
-
-                    r0_avx=a[6*small_len_avx];
-                    r1_avx=a[6*small_len_avx+1];
-                    r2_avx=a[6*small_len_avx+2];
-                    r3_avx=a[6*small_len_avx+3];
-		    a_bucket[0+54]=r0_avx;
-		    a_bucket[1+54]=r1_avx;
-		    a_bucket[2+54]=r2_avx;
-		    a_bucket[3+54]=r3_avx;
-		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
-
-		//------------------AVX evaluation for 7th poly ends------------------
-		
-	
-
-		//CLOCK2=cpucycles();
-		//CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1);
-		//printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1);
-
-
-		//CLOCK1=cpucycles();
-		//-----------------Forward transposes--------------------------------------
-			transpose_n1(a_bucket);
-			transpose_n1(a_bucket+16);
-			transpose_n1(a_bucket+32);
-			transpose_n1(a_bucket+48);
-
-		//-----------------Forwatrd transposes ends---------------------------------
-
-		//----------------------all multiplications---------------------------------
-		if(f==0){
-			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		else{
-			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		/*
-		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
-		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
-		*/
-
-
-		//----------------------all multiplications ends-----------------------------
-
-
-		//-----------------Reverse transposes--------------------------------------
-
-			/*
-			transpose(c_bucket);
-			transpose(c_bucket+16);
-
-			transpose(c_bucket+2*SCM_SIZE);
-			transpose(c_bucket+16+2*SCM_SIZE);
-
-			transpose(c_bucket+4*SCM_SIZE);
-			transpose(c_bucket+16+4*SCM_SIZE);
-
-			transpose(c_bucket+6*SCM_SIZE);
-			transpose(c_bucket+16+6*SCM_SIZE);
-			*/
-		//-----------------Reverse transposes ends---------------------------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
-
-		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
-		
-}
-
-static void KARA_eval(__m256i* b, __m256i *b_bucket){
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-		//-------1st poly----------------------------------------------------
-                    r0_avx=b[0];
-                    r1_avx=b[1];
-                    r2_avx=b[2];
-                    r3_avx=b[3];
-		    b_bucket[0]=r0_avx;
-		    b_bucket[1]=r1_avx;
-		    b_bucket[2]=r2_avx;
-		    b_bucket[3]=r3_avx;
-		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
-		//-------2nd poly----------------------------------------------------
-
-                    r0_avx=b[small_len_avx];
-                    r1_avx=b[small_len_avx+1];
-                    r2_avx=b[small_len_avx+2];
-                    r3_avx=b[small_len_avx+3];
-		    b_bucket[0+9]=r0_avx;
-		    b_bucket[1+9]=r1_avx;
-		    b_bucket[2+9]=r2_avx;
-		    b_bucket[3+9]=r3_avx;
-		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
-
-		//-------3rd poly----------------------------------------------------
-
-                    r0_avx=b[2*small_len_avx+0];
-                    r1_avx=b[2*small_len_avx+1];
-                    r2_avx=b[2*small_len_avx+2];
-                    r3_avx=b[2*small_len_avx+3];
-		    b_bucket[0+18]=r0_avx;
-		    b_bucket[1+18]=r1_avx;
-		    b_bucket[2+18]=r2_avx;
-		    b_bucket[3+18]=r3_avx;
-		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
-
-		//-------4th poly----------------------------------------------------
-                    r0_avx=b[3*small_len_avx];
-                    r1_avx=b[3*small_len_avx+1];
-                    r2_avx=b[3*small_len_avx+2];
-                    r3_avx=b[3*small_len_avx+3];
-		    b_bucket[0+27]=r0_avx;
-		    b_bucket[1+27]=r1_avx;
-		    b_bucket[2+27]=r2_avx;
-		    b_bucket[3+27]=r3_avx;
-		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
-
-		//-------5th poly----------------------------------------------------
-
-                    r0_avx=b[4*small_len_avx];
-                    r1_avx=b[4*small_len_avx+1];
-                    r2_avx=b[4*small_len_avx+2];
-                    r3_avx=b[4*small_len_avx+3];
-		    b_bucket[0+36]=r0_avx;
-		    b_bucket[1+36]=r1_avx;
-		    b_bucket[2+36]=r2_avx;
-		    b_bucket[3+36]=r3_avx;
-		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
-
-		//-------6th poly----------------------------------------------------
-
-                    r0_avx=b[5*small_len_avx];
-                    r1_avx=b[5*small_len_avx+1];
-                    r2_avx=b[5*small_len_avx+2];
-                    r3_avx=b[5*small_len_avx+3];
-		    b_bucket[0+45]=r0_avx;
-		    b_bucket[1+45]=r1_avx;
-		    b_bucket[2+45]=r2_avx;
-		    b_bucket[3+45]=r3_avx;
-		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
-
-		//-------7th poly----------------------------------------------------
-
-                    r0_avx=b[6*small_len_avx];
-                    r1_avx=b[6*small_len_avx+1];
-                    r2_avx=b[6*small_len_avx+2];
-                    r3_avx=b[6*small_len_avx+3];
-		    b_bucket[0+54]=r0_avx;
-		    b_bucket[1+54]=r1_avx;
-		    b_bucket[2+54]=r2_avx;
-		    b_bucket[3+54]=r3_avx;
-		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
-
-		//--------------Evaluating B poly ends-------------------------------
-
-			transpose_n1(b_bucket);
-			transpose_n1(b_bucket+16);
-			transpose_n1(b_bucket+32);
-			transpose_n1(b_bucket+48);	
-}
-
-static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
-
-		//int64_t i;
-		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
-
-		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
-
-		//CLOCK1=cpucycles();
-
-		   //------------------------AVX interpolation for 1st poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[0];
-				res_avx2 = c_bucket[1];
-				res_avx4 = c_bucket[2];
-				res_avx6 = c_bucket[3];
-
-				c6_avx=c_bucket[6];
-				c7_avx=c_bucket[7];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[16];
-				res_avx3 = c_bucket[17];
-				res_avx5 = c_bucket[18];
-				res_avx7 = c_bucket[19];
-
-				c22_avx=c_bucket[22];
-				c23_avx=c_bucket[23];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final0[0]=res_avx0;
-				result_final0[1]=res_avx1;
-
-				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final0[6]=res_avx6;
-				result_final0[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 1st poly ends--------------
-
-
-		   //------------------------AVX interpolation for 2nd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[9]; //c_bucket0
-				res_avx2 = c_bucket[10]; //c_bucket1
-				res_avx4 = c_bucket[11]; //c_bucket2
-				res_avx6 = c_bucket[12]; //c_bucket3
-
-				c6_avx=c_bucket[15]; //c_bucket6
-				c7_avx=c_bucket[32]; //c_bucket7
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[25]; //c_bucket0
-				res_avx3 = c_bucket[26]; //c_bucket1
-				res_avx5 = c_bucket[27]; //c_bucket2
-				res_avx7 = c_bucket[28]; //c_bucket3
-
-				c22_avx=c_bucket[31];
-				c23_avx=c_bucket[48];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final1[0]=res_avx0;
-				result_final1[1]=res_avx1;
-
-				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final1[6]=res_avx6;
-				result_final1[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 2nd poly ends--------------
-
-		   //------------------------AVX interpolation for 3rd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[34]; //c_bucket0
-				res_avx2 = c_bucket[35]; //c_bucket1
-				res_avx4 = c_bucket[36];
-				res_avx6 = c_bucket[37];
-
-				c6_avx=c_bucket[40];
-				c7_avx=c_bucket[41];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[50]; //c_bucket0
-				res_avx3 = c_bucket[51]; //c_bucket1
-				res_avx5 = c_bucket[52];
-				res_avx7 = c_bucket[53];
-
-				c22_avx=c_bucket[56];
-				c23_avx=c_bucket[57];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-			//loop4
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-			//loop5
-				result_final2[0]=res_avx0;
-				result_final2[1]=res_avx1;
-
-				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final2[6]=res_avx6;
-				result_final2[7]=res_avx7;
-
-		   //------------------------AVX interpolation for 3rd poly ends--------------
-		
-		   //------------------------AVX interpolation for 4th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[43];
-				res_avx2 = c_bucket[44];
-				res_avx4 = c_bucket[45];
-				res_avx6 = c_bucket[46];
-
-				c6_avx=c_bucket[65];
-				c7_avx=c_bucket[66];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[59];
-				res_avx3 = c_bucket[60];
-				res_avx5 = c_bucket[61];
-				res_avx7 = c_bucket[62];
-
-				c22_avx=c_bucket[81];
-				c23_avx=c_bucket[82];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final3[0]=res_avx0;
-				result_final3[1]=res_avx1;
-
-				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final3[6]=res_avx6;
-				result_final3[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 4th poly ends--------------
-
-		   //------------------------AVX interpolation for 5th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[68];
-				res_avx2 = c_bucket[69];
-				res_avx4 = c_bucket[70];
-				res_avx6 = c_bucket[71];
-
-				c6_avx=c_bucket[74];
-				c7_avx=c_bucket[75];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[84];
-				res_avx3 = c_bucket[85];
-				res_avx5 = c_bucket[86];
-				res_avx7 = c_bucket[87];
-
-				c22_avx=c_bucket[90];
-				c23_avx=c_bucket[91];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final4[0]=res_avx0;
-				result_final4[1]=res_avx1;
-
-				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final4[6]=res_avx6;
-				result_final4[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 5th poly ends--------------
-
-		   //------------------------AVX interpolation for 6th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[77];
-				res_avx2 = c_bucket[78];
-				res_avx4 = c_bucket[79];
-				res_avx6 = c_bucket[96];
-
-				c6_avx=c_bucket[99];
-				c7_avx=c_bucket[100];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[93];
-				res_avx3 = c_bucket[94];
-				res_avx5 = c_bucket[95];
-				res_avx7 = c_bucket[112];
-
-				c22_avx=c_bucket[115];
-				c23_avx=c_bucket[116];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final5[0]=res_avx0;
-				result_final5[1]=res_avx1;
-
-				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final5[6]=res_avx6;
-				result_final5[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 6th poly ends--------------
-
-		   //------------------------AVX interpolation for 7th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[102];
-				res_avx2 = c_bucket[103];
-				res_avx4 = c_bucket[104];
-				res_avx6 = c_bucket[105];
-
-				c6_avx=c_bucket[108];
-				c7_avx=c_bucket[109];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[118];
-				res_avx3 = c_bucket[119];
-				res_avx5 = c_bucket[120];
-				res_avx7 = c_bucket[121];
-
-				c22_avx=c_bucket[124];
-				c23_avx=c_bucket[125];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final6[0]=res_avx0;
-				result_final6[1]=res_avx1;
-
-				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final6[6]=res_avx6;
-				result_final6[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 7th poly ends--------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
-		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
-
-
-
-}
-
-static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
-
-	int i;
-
-//---------------AVX data-----------------------------
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-	__m256i aw_avx[7*small_len_avx];
-
-//----------------AVX data----------------------------
-
-
-// EVALUATION
-
-	//CLOCK1=cpucycles();
-
-	for (i=0; i<small_len_avx; i++){
-		r0_avx=a_avx[i];
-		r1_avx=a_avx[i + small_len_avx];
-		r2_avx=a_avx[i + 2*small_len_avx];
-		r3_avx=a_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		aw_avx[6*small_len_avx+i]= r0_avx; 
-		aw_avx[i]= r3_avx;
-	}
-
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
-
-	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);//New
-
-}
-
-static void TC_eval(__m256i* b_avx, __m256i* b_bucket){
-
-	int i;
-	__m256i bw_avx[7*small_len_avx];
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-
-	for (i=0; i<small_len_avx; i++){
-		
-		r0_avx=b_avx[i];
-		r1_avx=b_avx[i + small_len_avx];
-		r2_avx=b_avx[i + 2*small_len_avx];
-		r3_avx=b_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		bw_avx[6*small_len_avx+i]= r0_avx;
-		bw_avx[i]= r3_avx;
-	}
-
-	KARA_eval(bw_avx, b_bucket);
-
-}
-
-
-static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){
-
-	int i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
-
-	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx];
-
-	__m256i res_avx_output[2*AVX_N1];
-
-	//CLOCK1=cpucycles();
-
-	
-	transpose_n1(c_bucket);
-	transpose_n1(c_bucket+16);
-
-	transpose_n1(c_bucket+2*SCM_SIZE);
-	transpose_n1(c_bucket+16+2*SCM_SIZE);
-
-	transpose_n1(c_bucket+4*SCM_SIZE);
-	transpose_n1(c_bucket+16+4*SCM_SIZE);
-
-	transpose_n1(c_bucket+6*SCM_SIZE);
-	transpose_n1(c_bucket+16+6*SCM_SIZE);
-	
-
-	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx);
-
-	for (i = 0; i < 2*small_len_avx; i++) {
-
-		r0_avx = w1_avx[i];
-		r1_avx = w2_avx[i];
-		r2_avx = w3_avx[i];
-		r3_avx = w4_avx[i];
-		r4_avx = w5_avx[i];
-		r5_avx = w6_avx[i];
-		r6_avx = w7_avx[i];
-		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
-		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
-		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
-		r3_avx = _mm256_srli_epi16(r3_avx, 1);
-		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
-		temp_avx = _mm256_slli_epi16(r6_avx, 6);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_slli_epi16(r4_avx, 1);
-		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
-		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 6);
-		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
-		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
-		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 3);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
-		r4_avx = _mm256_srli_epi16(r4_avx, 3);
-		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
-		temp_avx = _mm256_slli_epi16(r3_avx, 4);
-		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
-		r1_avx= _mm256_srli_epi16(r1_avx, 1); 	
-		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
-		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
-		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
-		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
-		r5_avx= _mm256_srli_epi16(temp_avx, 2);
-		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
-
-		if(i<small_len_avx){
-			res_avx_output[0*small_len_avx+i]=r6_avx;
-			res_avx_output[1*small_len_avx+i]=r5_avx;
-			res_avx_output[2*small_len_avx+i]=r4_avx;
-			res_avx_output[3*small_len_avx+i]=r3_avx;
-			res_avx_output[4*small_len_avx+i]=r2_avx;
-			res_avx_output[5*small_len_avx+i]=r1_avx;
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-		else{
-			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
-			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
-			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
-			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
-			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
-			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-	}
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
-
-	// Reduction by X^256 + 1
-	for(i=0; i<16; i++)
-  {
-		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
-  }
-
-}
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.c b/crypto_kem/lightsaber/clean/SABER_indcpa.c
index 9dcdfb93..fac58484 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.c
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.c
@@ -11,81 +11,102 @@
 #define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
 void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {{0}};
-
-    uint8_t seed_A[SABER_SEEDBYTES];
-    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
     size_t i, j;
 
+    poly A[SABER_L][SABER_L]; // public matrix, expanded from seed_A
+    poly s[SABER_L]; // secret vector, packed into sk
+    poly res[SABER_L]; // matrix-vector product, rounded in place and packed into pk
+
+    uint8_t rand[SABER_NOISESEEDBYTES]; // seed handed to GenSecret
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // matrix seed is generated directly in the tail of pk
+
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(seed_s, SABER_NOISE_SEEDBYTES);
 
-    PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, seed_s);
-    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1);
+    randombytes(rand, SABER_NOISESEEDBYTES);
+    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, rand);
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, s); // pack secret key
 
+    PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A from the seed stored in pk
+    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // 1 => A is used in transposed order
+
+
+    // rounding: add h1, then drop the low (SABER_EQ - SABER_EP) bits of every coefficient
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1; // keep only the low SABER_EQ bits
         }
     }
 
-    PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b);
-    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A));
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(pk, res); // pack public key into the head of pk; seed_A already occupies its tail
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {{0}};
-    uint16_t vp[SABER_N] = {0};
-    uint16_t mp[SABER_N];
-    uint16_t b[SABER_L][SABER_N];
+
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
     size_t i, j;
+
+    poly A[SABER_L][SABER_L];
+    poly res[SABER_L];
+    poly s[SABER_L];
+    poly *temp = A[0]; // re-use stack space: row 0 of A becomes scratch once the matrix product is done
+    poly *vprime = &A[0][0]; // aliases temp[0]; receives the inner product
+    poly *message = &A[0][1]; // aliases temp[1]; only written after InnerProd has read temp
+
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // second ciphertext part, stored after the packed vector
 
+    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(s, noiseseed);
     PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_LIGHTSABER_CLEAN_GenSecret(sp, seed_sp);
-    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0);
+    PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed
 
-    for (i = 0; i < SABER_L; i++) {
+
+    // rounding: add h1, then drop the low (SABER_EQ - SABER_EP) bits of every coefficient
+    for (i = 0; i < SABER_L; i++) { // shift right EQ-EP bits
         for (j = 0; j < SABER_N; j++) {
-            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1; // keep only the low SABER_EQ bits
         }
     }
+    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, res); // first ciphertext part: the packed rounded vector
 
-    PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp);
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, pk);
-    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp);
-
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(mp, m);
-
-    for (j = 0; j < SABER_N; j++) {
-        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET);
-    }
-
-    PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
-}
-
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
-
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N];
-    uint16_t v[SABER_N] = {0};
-    uint16_t cm[SABER_N];
-    size_t i;
-
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk);
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(b, ciphertext);
-    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES);
+    // inner product of the vector unpacked from pk with s; the result is reduced mod p in the loop below
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, pk);
+    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(vprime, temp, s); // output aliases temp[0]: relies on poly_mul buffering the product before writing c
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(message, m); // overwrites temp[1], which is no longer needed
 
     for (i = 0; i < SABER_N; i++) {
-        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); // add h1, subtract the message bit moved to bit SABER_EP - 1
+        vprime->coeffs[i] &= SABER_P - 1; // reduce mod p
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET; // keep the top SABER_ET of the SABER_EP bits
+    }
+
+    PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(msk_c, vprime);
+}
+
+
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
+
+    poly temp[SABER_L];
+    poly s[SABER_L];
+
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES;
+    poly *v = &temp[0];
+    poly *cm = &temp[1];
+
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(s, sk);
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(temp, ciphertext);
+    PQCLEAN_LIGHTSABER_CLEAN_InnerProd(&temp[0], temp, s);
+
+    PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(cm, packed_cm);
+
+    for (i = 0; i < SABER_N; i++) {
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET));
+        v->coeffs[i] &= SABER_P - 1;
+        v->coeffs[i] >>= SABER_EP - 1;
     }
 
     PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(m, v);
diff --git a/crypto_kem/lightsaber/clean/SABER_indcpa.h b/crypto_kem/lightsaber/clean/SABER_indcpa.h
index efccbf5e..df8906ab 100644
--- a/crypto_kem/lightsaber/clean/SABER_indcpa.h
+++ b/crypto_kem/lightsaber/clean/SABER_indcpa.h
@@ -5,7 +5,7 @@
 
 void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
 
 void PQCLEAN_LIGHTSABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
diff --git a/crypto_kem/lightsaber/clean/SABER_params.h b/crypto_kem/lightsaber/clean/SABER_params.h
index a6a9fc55..8da6ec34 100644
--- a/crypto_kem/lightsaber/clean/SABER_params.h
+++ b/crypto_kem/lightsaber/clean/SABER_params.h
@@ -2,19 +2,21 @@
 #define PARAMS_H
 
 
-/* Change this for different security strengths */
-
 /* Don't change anything below this line */
 #define SABER_L 2
 #define SABER_MU 10
 #define SABER_ET 3
 
-#define SABER_EQ 13
-#define SABER_EP 10
 #define SABER_N 256
 
+#define SABER_EP 10 // log2 of the modulus p
+#define SABER_P (1 << SABER_EP) // p = 2^SABER_EP
+
+#define SABER_EQ 13 // log2 of the modulus q
+#define SABER_Q (1 << SABER_EQ) // q = 2^SABER_EQ
+
 #define SABER_SEEDBYTES 32
-#define SABER_NOISE_SEEDBYTES 32
+#define SABER_NOISESEEDBYTES 32
 #define SABER_KEYBYTES 32
 #define SABER_HASHBYTES 32
 
diff --git a/crypto_kem/lightsaber/clean/api.h b/crypto_kem/lightsaber/clean/api.h
index f0fe63f1..2e39ae02 100644
--- a/crypto_kem/lightsaber/clean/api.h
+++ b/crypto_kem/lightsaber/clean/api.h
@@ -15,4 +15,4 @@ int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k,
 int PQCLEAN_LIGHTSABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
 
 
-#endif /* api_h */
+#endif /* PQCLEAN_LIGHTSABER_CLEAN_API_H */
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c
index 2a39a1d7..f64c4143 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.c
+++ b/crypto_kem/lightsaber/clean/pack_unpack.c
@@ -1,140 +1,153 @@
-#include "api.h"
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include <string.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { // pack the low 3 bits of each coefficient
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advanced 8 coefficients per iteration
+    uint8_t *out = bytes; // write cursor, advanced 3 bytes per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ((data[offset_data + 1] & 0x7) << 3) | ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2) & 0x01) | ((data[offset_data + 3] & 0x7) << 1) | ((data[offset_data + 4] & 0x7) << 4) | (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1) & 0x03) | ((data[offset_data + 6] & 0x7) << 2) | ((data[offset_data + 7] & 0x7) << 5);
+        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6); // in[2] straddles bytes 0 and 1
+        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7); // in[5] straddles bytes 1 and 2
+        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5);
+        in += 8;
+        out += 3;
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { // unpack 3-bit coefficients
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, advanced 3 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor, advanced 8 coefficients per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ((bytes[offset_byte + 0]) >> 3) & 0x07;
-        data[offset_data + 2] = (((bytes[offset_byte + 0]) >> 6) & 0x03) | (((bytes[offset_byte + 1]) & 0x01) << 2);
-        data[offset_data + 3] = ((bytes[offset_byte + 1]) >> 1) & 0x07;
-        data[offset_data + 4] = ((bytes[offset_byte + 1]) >> 4) & 0x07;
-        data[offset_data + 5] = (((bytes[offset_byte + 1]) >> 7) & 0x01) | (((bytes[offset_byte + 2]) & 0x03) << 1);
-        data[offset_data + 6] = ((bytes[offset_byte + 2] >> 2) & 0x07);
-        data[offset_data + 7] = ((bytes[offset_byte + 2] >> 5) & 0x07);
+        out[0] = (in[0]) & 0x07;
+        out[1] = ((in[0]) >> 3) & 0x07;
+        out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2); // reassembled from bytes 0 and 1
+        out[3] = ((in[1]) >> 1) & 0x07;
+        out[4] = ((in[1]) >> 4) & 0x07;
+        out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1); // reassembled from bytes 1 and 2
+        out[6] = ((in[2] >> 2) & 0x07);
+        out[7] = ((in[2] >> 5) & 0x07);
+        in += 3;
+        out += 8;
     }
 }
 
-static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { // pack the low 13 bits of each coefficient
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advanced 8 coefficients per iteration
+    uint8_t *out = bytes; // write cursor, advanced 13 bytes per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
-        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
-        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
-        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
-        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
-        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
-        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
-        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
-        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
-        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
-        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff);
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+        out[2] = ((in[1] >> 3) & 0xff);
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+        out[5] = ((in[3] >> 1) & 0xff);
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+        out[7] = ((in[4] >> 4) & 0xff);
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+        out[10] = ((in[6] >> 2) & 0xff);
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+        out[12] = ((in[7] >> 5) & 0xff);
+        in += 8;
+        out += 13;
     }
 }
 
-static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { // unpack 13-bit coefficients
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, advanced 13 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor, advanced 8 coefficients per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        in += 13;
+        out += 8;
     }
 }
 
-static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { // pack the low 10 bits of each coefficient
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advanced 4 coefficients per iteration
+    uint8_t *out = bytes; // write cursor, advanced 5 bytes per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6);
-        bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff);
+        out[0] = (in[0] & (0xff));
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
+        out[4] = ((in[3] >> 2) & 0xff);
+        in += 4;
+        out += 5;
     }
 }
 
-static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { // unpack 10-bit coefficients
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, advanced 5 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor, advanced 4 coefficients per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8);
-        data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6);
-        data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        in += 5;
+        out += 4;
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { // pack SABER_L polynomials back to back with POLq2BS
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLq2BS(bytes + i * SABER_POLYBYTES, data[i]);
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); // polynomial i occupies its own SABER_POLYBYTES slice
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) {
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { // unpack SABER_L polynomials with BS2POLq
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLq(data[i], bytes + i * SABER_POLYBYTES);
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); // polynomial i is read from its own SABER_POLYBYTES slice
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { // pack SABER_L polynomials back to back with POLp2BS
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]);
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); // polynomial i occupies its own SABER_POLYCOMPRESSEDBYTES slice
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { // unpack SABER_L polynomials with BS2POLp
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8));
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); // polynomial i is read from its own SABER_POLYCOMPRESSEDBYTES slice
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) {
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { // expand each message bit into one 0/1 coefficient
     size_t i, j;
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            data[j * 8 + i] = ((bytes[j] >> i) & 0x01);
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j, least significant bit first
         }
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) {
+void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { // collect the lowest bit of each coefficient into message bytes
     size_t i, j;
     memset(bytes, 0, SABER_KEYBYTES);
 
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i);
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); // coefficient 8*j + i supplies bit i of byte j
         }
     }
 }
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.h b/crypto_kem/lightsaber/clean/pack_unpack.h
index 44ccf31a..0eda3392 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.h
+++ b/crypto_kem/lightsaber/clean/pack_unpack.h
@@ -1,27 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]);
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
+void PQCLEAN_LIGHTSABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/poly.c b/crypto_kem/lightsaber/clean/poly.c
index 9bb55afe..e5be857f 100644
--- a/crypto_kem/lightsaber/clean/poly.c
+++ b/crypto_kem/lightsaber/clean/poly.c
@@ -3,32 +3,40 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include <stddef.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
+void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) { // c = A*s, or A^T*s when transpose is non-zero
     size_t i, j;
-    for (i = 0; i < SABER_L; i++) {
-        for (j = 0; j < SABER_L; j++) {
-            if (transpose == 1) {
-                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
-            } else {
-                PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
+
+    if (transpose) { // column i of A is paired with s
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); // first term overwrites c[i], so c needs no prior zeroing
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); // remaining terms accumulate
+            }
+        }
+    } else { // row i of A is paired with s
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); // first term overwrites c[i]
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_LIGHTSABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); // remaining terms accumulate
             }
         }
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    size_t j;
-    for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
+void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) { // c = sum over i of b[i]*s[i]
+    size_t i;
+
+    PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); // first term overwrites c, so c needs no prior zeroing
+    for (i = 1; i < SABER_L; i++) {
+        PQCLEAN_LIGHTSABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); // remaining terms accumulate into c
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
+void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
@@ -37,13 +45,13 @@ void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], c
     }
 }
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYCOINBYTES];
+void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) { // derive the SABER_L secret polynomials from seed
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // SABER_POLYCOINBYTES of SHAKE-128 output per polynomial
 
-    shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES);
+    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); // expand the seed to fill buf
 
     for (i = 0; i < SABER_L; i++) {
-        PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES);
+        PQCLEAN_LIGHTSABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); // presumably centered-binomial sampling from slice i; confirm in cbd.c
     }
 }
diff --git a/crypto_kem/lightsaber/clean/poly.h b/crypto_kem/lightsaber/clean/poly.h
index 1f50c48e..be074e43 100644
--- a/crypto_kem/lightsaber/clean/poly.h
+++ b/crypto_kem/lightsaber/clean/poly.h
@@ -3,13 +3,21 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose);
+typedef union { // NOTE(review): single-member union; presumably kept as a union to mirror the AVX2 poly type, confirm
+    uint16_t coeffs[SABER_N]; // SABER_N coefficients, 16 bits each
+} poly;
 
-void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]);
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose);
 
-void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
+void PQCLEAN_LIGHTSABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_LIGHTSABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
+
+
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/lightsaber/clean/poly_mul.c b/crypto_kem/lightsaber/clean/poly_mul.c
index c7f5c424..d82d8585 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.c
+++ b/crypto_kem/lightsaber/clean/poly_mul.c
@@ -1,4 +1,4 @@
-#include "poly_mul.h"
+#include "poly.h"
 #include <stdint.h>
 #include <string.h>
 
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t
 }
 
 /* res += a*b */
-void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
-    uint16_t c[2 * SABER_N] = {0};
+void PQCLEAN_LIGHTSABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { // c = a*b when accumulate == 0, otherwise c += a*b
+    uint16_t C[2 * SABER_N] = {0}; // local buffer for the unreduced product; c is written only after it is filled, so c may alias a or b
     size_t i;
 
-    toom_cook_4way(c, a, b);
+    toom_cook_4way(C, a->coeffs, b->coeffs); // presumably leaves the 2*SABER_N-coefficient product in C; see toom_cook_4way
 
     /* reduction */
-    for (i = SABER_N; i < 2 * SABER_N; i++) {
-        res[i - SABER_N] += (c[i - SABER_N] - c[i]);
+    if (accumulate == 0) {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); // overwrite: low half minus high half, i.e. reduce modulo X^SABER_N + 1
+        }
+    } else {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); // accumulate the same reduced product onto c
+        }
     }
 }
diff --git a/crypto_kem/lightsaber/clean/poly_mul.h b/crypto_kem/lightsaber/clean/poly_mul.h
index 5ec233bb..b28b04f6 100644
--- a/crypto_kem/lightsaber/clean/poly_mul.h
+++ b/crypto_kem/lightsaber/clean/poly_mul.h
@@ -1,9 +1,3 @@
-#ifndef POLY_MUL_H
-#define POLY_MUL_H
-#include "SABER_params.h"
-#include <stdint.h>
-
-void PQCLEAN_LIGHTSABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
-#endif
+
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 87187702..7eb15ca2 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/b53a47b5/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile
index 65cc21ef..070665b4 100644
--- a/crypto_kem/saber/avx2/Makefile
+++ b/crypto_kem/saber/avx2/Makefile
@@ -2,7 +2,7 @@
 
 LIB=libsaber_avx2.a
 HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
-OBJECTS=cbd.o kem.o pack_unpack.o SABER_indcpa.o verify.o 
+OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
 
diff --git a/crypto_kem/saber/avx2/SABER_indcpa.c b/crypto_kem/saber/avx2/SABER_indcpa.c
index 5515c610..e01eb650 100644
--- a/crypto_kem/saber/avx2/SABER_indcpa.c
+++ b/crypto_kem/saber/avx2/SABER_indcpa.c
@@ -1,416 +1,125 @@
-#include "./polymul/toom-cook_4way.c"
 #include "SABER_indcpa.h"
 #include "SABER_params.h"
-#include "api.h"
-#include "cbd.h"
 #include "fips202.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include "randombytes.h"
 #include <stdint.h>
-#include <stdio.h>
 #include <string.h>
-//#include "randombytes.h"
-//#include "./polymul/toom_cook_4/toom-cook_4way.c"
 
-#define h1 4 //2^(EQ-EP-1)
+#define h1 (1 << (SABER_EQ - SABER_EP - 1))
+#define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
-#define h2 ( (1<<(SABER_EP-2)) - (1<<(SABER_EP-SABER_ET-1)) + (1<<(SABER_EQ-SABER_EP-1)) )
+void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly *skpv1 = A[0]; // use first row of A to hold sk temporarily
+    toom4_points skpv1_eval[SABER_L];
+    poly res[SABER_L];
 
-static void POL2MSG(uint8_t *message_dec, const uint16_t *message_dec_unpacked) {
-    int32_t i, j;
+    uint8_t rand[SABER_NOISESEEDBYTES];
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
 
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        message_dec[j] = 0;
-        for (i = 0; i < 8; i++) {
-            message_dec[j] = message_dec[j] | (message_dec_unpacked[j * 8 + i] << i);
-        }
-    }
-}
+    randombytes(seed_A, SABER_SEEDBYTES);
+    shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
 
-/*-----------------------------------------------------------------------------------
-    This routine generates a=[Matrix K x K] of 256-coefficient polynomials
--------------------------------------------------------------------------------------*/
+    randombytes(rand, SABER_NOISESEEDBYTES);
+    PQCLEAN_SABER_AVX2_GenSecret(skpv1, rand);
+    PQCLEAN_SABER_AVX2_POLVECq2BS(sk, skpv1); // pack secret key
 
-static void GenMatrix(polyvec *a, const uint8_t *seed) {
-    uint8_t buf[SABER_K * SABER_K * 13 * SABER_N / 8];
-
-    uint16_t temp_ar[SABER_N];
-
-    int i, j, k;
-    uint16_t mod = (SABER_Q - 1);
-
-    shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            PQCLEAN_SABER_AVX2_BS2POLq(temp_ar, buf + (i * SABER_K + j) * 13 * SABER_N / 8);
-            for (k = 0; k < SABER_N; k++) {
-                a[i].vec[j].coeffs[k] = (temp_ar[k])& mod ;
-            }
-        }
-    }
-}
-
-static void GenSecret(uint16_t r[SABER_K][SABER_N], const uint8_t *seed) {
-
-    uint32_t i;
-
-    uint8_t buf[SABER_MU * SABER_N * SABER_K / 8];
-
-    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES);
-
-    for (i = 0; i < SABER_K; i++) {
-        PQCLEAN_SABER_AVX2_cbd(r[i], buf + i * SABER_MU * SABER_N / 8);
-    }
-}
-
-//********************************matrix-vector mul routines*****************************************************
-static void matrix_vector_mul(__m256i res_avx[NUM_POLY][AVX_N1], __m256i a1_avx_combined[NUM_POLY][NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4], int isTranspose) {
-    int64_t i, j;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        for (j = 0; j < NUM_POLY; j++) {
-
-            if (isTranspose == 0) {
-                toom_cook_4way_avx_n1(a1_avx_combined[i][j], b_bucket[j], c_bucket, j);
-            } else {
-                toom_cook_4way_avx_n1(a1_avx_combined[j][i], b_bucket[j], c_bucket, j);
-            }
-        }
-
-        TC_interpol(c_bucket, res_avx[i]);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &skpv1[j]);
     }
 
-}
+    PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A); // sample matrix A
+    PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 1); // Matrix in transposed order
 
-static void vector_vector_mul(__m256i res_avx[AVX_N1], __m256i a_avx[NUM_POLY][AVX_N1], __m256i b_bucket[NUM_POLY][SCHB_N * 4]) {
-
-    int64_t i;
-
-    __m256i c_bucket[2 * SCM_SIZE * 4]; //Holds results for 9 Karatsuba at a time
-
-    for (i = 0; i < NUM_POLY; i++) {
-        toom_cook_4way_avx_n1(a_avx[i], b_bucket[i], c_bucket, i);
-    }
-    TC_interpol(c_bucket, res_avx);
-}
-
-//********************************matrix-vector mul routines*****************************************************
-
-void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk) {
-
-    polyvec a[SABER_K];
-
-    uint16_t skpv1[SABER_K][SABER_N];
-
-
-
-    uint8_t seed[SABER_SEEDBYTES];
-    uint8_t noiseseed[SABER_COINBYTES];
-    int32_t i, j, k;
-
-
-//--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-//--------------AVX declaration ends------------------
-
-    randombytes(seed, SABER_SEEDBYTES);
-
-    shake128(seed, SABER_SEEDBYTES, seed, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(noiseseed, SABER_COINBYTES);
-
-
-    GenMatrix(a, seed); //sample matrix A
-
-    GenSecret(skpv1, noiseseed);
-
-
-// Load sk into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
-        }
-
-    }
-
-    // Load a into avx vectors
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
+    // rounding
+    for (i = 0; i < SABER_L; i++) {
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1;
         }
     }
 
-
-
-    //------------------------do the matrix vector multiplication and rounding------------
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 1);// Matrix-vector multiplication; Matrix in transposed order
-
-    // Now truncation
-
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-        }
-    }
-
-    //------------------Pack sk into byte string-------
-
-    PQCLEAN_SABER_AVX2_POLVEC2BS(sk, (const uint16_t (*)[SABER_N])skpv1, SABER_Q);
-
-    //------------------Pack pk into byte string-------
-
-    for (i = 0; i < SABER_K; i++) { // reuses skpv1[] for unpacking avx of public-key
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *) (skpv1[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-    PQCLEAN_SABER_AVX2_POLVEC2BS(pk, (const uint16_t (*)[SABER_N])skpv1, SABER_P); // load the public-key into pk byte string
-
-
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // now load the seedbytes in PK. Easy since seed bytes are kept in byte format.
-        pk[SABER_POLYVECCOMPRESSEDBYTES + i] = seed[i];
-    }
-
+    PQCLEAN_SABER_AVX2_POLVECp2BS(pk, res); // pack public key
 }
 
 
 void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
+    size_t i, j;
 
+    poly A[SABER_L][SABER_L];
+    poly res[SABER_L];
+    toom4_points skpv1_eval[SABER_L];
 
-    uint32_t i, j, k;
-    polyvec a[SABER_K];     // skpv;
-    uint8_t seed[SABER_SEEDBYTES];
-    uint16_t pkcl[SABER_K][SABER_N];    //public key of received by the client
+    poly *temp = A[0]; // re-use stack space
+    poly *vprime = &A[0][0];
+    poly *message = &A[0][1];
 
+    const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES;
 
-    uint16_t skpv1[SABER_K][SABER_N];
-    uint16_t temp[SABER_K][SABER_N];
-    uint16_t message[SABER_KEYBYTES * 8];
-
-    uint8_t msk_c[SABER_SCALEBYTES_KEM];
-
-    //--------------AVX declaration------------------
-
-    __m256i sk_avx[SABER_K][SABER_N / 16];
-    __m256i mod, mod_p;
-    __m256i res_avx[SABER_K][SABER_N / 16];
-    __m256i vprime_avx[SABER_N / 16];
-    __m256i a_avx[SABER_K][SABER_K][SABER_N / 16];
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i pkcl_avx[SABER_K][SABER_N / 16];
-
-    __m256i message_avx[SABER_N / 16];
-
-    mod = _mm256_set1_epi16(SABER_Q - 1);
-    mod_p = _mm256_set1_epi16(SABER_P - 1);
-
-
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-
-    //--------------AVX declaration ends------------------
-    for (i = 0; i < SABER_SEEDBYTES; i++) { // Load the seedbytes in the client seed from PK.
-        seed[i] = pk[ SABER_POLYVECCOMPRESSEDBYTES + i];
+    PQCLEAN_SABER_AVX2_GenSecret(temp, noiseseed);
+    for (j = 0; j < SABER_L; j++) {
+        PQCLEAN_SABER_AVX2_toom4_eval(&skpv1_eval[j], &temp[j]);
     }
 
-    GenMatrix(a, seed);
-    GenSecret(skpv1, noiseseed);
+    PQCLEAN_SABER_AVX2_GenMatrix(A, seed_A);
+    PQCLEAN_SABER_AVX2_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const toom4_points *)skpv1_eval, 0); // 0 => not transposed
 
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sk_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&skpv1[i][j * 16]));
+    // rounding
+    for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits
+        for (j = 0; j < SABER_N; j++) {
+            res[i].coeffs[j] += h1;
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1;
         }
     }
-
-    // ----------- Load skpv1 into avx vectors ----------
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_K; j++) {
-            for (k = 0; k < SABER_N / 16; k++) {
-                a_avx[i][j][k] = _mm256_loadu_si256 ((__m256i const *) (&a[i].vec[j].coeffs[k * 16]));
-            }
-        }
-    }
-    //-----------------matrix-vector multiplication and rounding
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sk_avx[j], b_bucket[j]);
-    }
-    matrix_vector_mul(res_avx, a_avx, b_bucket, 0);// Matrix-vector multiplication; Matrix in normal order
-
-    // Now truncation
-
-    for (i = 0; i < SABER_K; i++) { //shift right EQ-EP bits
-        for (j = 0; j < SABER_N / 16; j++) {
-            res_avx[i][j] = _mm256_add_epi16 (res_avx[i][j], _mm256_set1_epi16(h1));
-            res_avx[i][j] = _mm256_srli_epi16 (res_avx[i][j], (SABER_EQ - SABER_EP) );
-            res_avx[i][j] = _mm256_and_si256 (res_avx[i][j], mod);
-
-        }
-    }
-
-
-    //-----this result should be put in b_prime for later use in server.
-    for (i = 0; i < SABER_K; i++) { // first store in 16 bit arrays
-        for (j = 0; j < SABER_N / 16; j++) {
-            _mm256_maskstore_epi32 ((int *)(temp[i] + j * 16), _mm256_set1_epi32(-1), res_avx[i][j]);
-        }
-    }
-
-    PQCLEAN_SABER_AVX2_POLVEC2BS(ciphertext, (const uint16_t (*)[SABER_N])temp, SABER_P); // Pack b_prime into ciphertext byte string
-
-//**************client matrix-vector multiplication ends******************//
-
-    //------now calculate the v'
-
-    //-------unpack the public_key
-    PQCLEAN_SABER_AVX2_BS2POLVEC(pkcl, pk, SABER_P);
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            pkcl_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pkcl[i][j * 16]));
-        }
-    }
-
-    // InnerProduct
-    //for(k=0;k<SABER_N/16;k++){
-    //  vprime_avx[k]=_mm256_xor_si256(vprime_avx[k],vprime_avx[k]);
-    //}
+    PQCLEAN_SABER_AVX2_POLVECp2BS(ciphertext, res);
 
     // vector-vector scalar multiplication with mod p
+    PQCLEAN_SABER_AVX2_BS2POLVECp(temp, pk);
+    PQCLEAN_SABER_AVX2_InnerProd(vprime, temp, skpv1_eval);
+    PQCLEAN_SABER_AVX2_BS2POLmsg(message, m);
 
-    vector_vector_mul(vprime_avx, pkcl_avx, b_bucket);
-
-    // Computation of v'+h1
-    for (i = 0; i < SABER_N / 16; i++) { //adding h1
-        vprime_avx[i] = _mm256_add_epi16(vprime_avx[i], _mm256_set1_epi16(h1));
-    }
-
-    // unpack m;
-    for (j = 0; j < SABER_KEYBYTES; j++) {
-        for (i = 0; i < 8; i++) {
-            message[8 * j + i] = ((m[j] >> i) & 0x01);
-        }
-    }
-    // message encoding
-    for (i = 0; i < SABER_N / 16; i++) {
-        message_avx[i] = _mm256_loadu_si256 ((__m256i const *) (&message[i * 16]));
-        message_avx[i] = _mm256_slli_epi16 (message_avx[i], (SABER_EP - 1) );
-    }
-
-    // SHIFTRIGHT(v'+h1-m mod p, EP-ET)
-    for (k = 0; k < SABER_N / 16; k++) {
-        vprime_avx[k] = _mm256_sub_epi16(vprime_avx[k], message_avx[k]);
-        vprime_avx[k] = _mm256_and_si256(vprime_avx[k], mod_p);
-        vprime_avx[k] = _mm256_srli_epi16 (vprime_avx[k], (SABER_EP - SABER_ET) );
-    }
-
-    // Unpack avx
-    for (j = 0; j < SABER_N / 16; j++) {
-        _mm256_maskstore_epi32 ((int *) (temp[0] + j * 16), _mm256_set1_epi32(-1), vprime_avx[j]);
-    }
-
-    PQCLEAN_SABER_AVX2_SABER_pack_4bit(msk_c, temp[0]);
-
-
-    for (j = 0; j < SABER_SCALEBYTES_KEM; j++) {
-        ciphertext[SABER_CIPHERTEXTBYTES + j] = msk_c[j];
+    for (i = 0; i < SABER_N; i++) {
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1));
+        vprime->coeffs[i] &= SABER_P - 1;
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET;
     }
 
+    PQCLEAN_SABER_AVX2_POLT2BS(msk_c, vprime);
 }
 
 
 void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
 
-    uint32_t i, j;
-    uint16_t sksv[SABER_K][SABER_N]; //secret key of the server
-    uint16_t pksv[SABER_K][SABER_N];
-    uint16_t message_dec_unpacked[SABER_KEYBYTES * 8];  // one element containes on decrypted bit;
-    uint8_t scale_ar[SABER_SCALEBYTES_KEM];
-    uint16_t op[SABER_N];
+    poly temp[SABER_L];
+    toom4_points sksv_eval[SABER_L];
 
-    //--------------AVX declaration------------------
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES;
+    poly *v = &temp[0];
+    poly *cm = &temp[1];
 
-
-    //__m256i mod_p;
-
-    __m256i v_avx[SABER_N / 16];
-
-    //__m256i acc[2*SABER_N/16];
-
-    __m256i sksv_avx[SABER_K][SABER_N / 16];
-    __m256i pksv_avx[SABER_K][SABER_N / 16];
-
-    //mod_p=_mm256_set1_epi16(SABER_P-1);
-
-    __m256i b_bucket[NUM_POLY][SCHB_N * 4];
-    //--------------AVX declaration ends------------------
-
-    //-------unpack the public_key
-
-    PQCLEAN_SABER_AVX2_BS2POLVEC(sksv, sk, SABER_Q); //sksv is the secret-key
-    PQCLEAN_SABER_AVX2_BS2POLVEC(pksv, ciphertext, SABER_P); //pksv is the ciphertext
-
-    for (i = 0; i < SABER_K; i++) {
-        for (j = 0; j < SABER_N / 16; j++) {
-            sksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&sksv[i][j * 16]));
-            pksv_avx[i][j] = _mm256_loadu_si256 ((__m256i const *) (&pksv[i][j * 16]));
-        }
+    PQCLEAN_SABER_AVX2_BS2POLVECq(temp, sk);
+    for (i = 0; i < SABER_L; i++) {
+        PQCLEAN_SABER_AVX2_toom4_eval(&sksv_eval[i], &temp[i]);
     }
 
-    for (i = 0; i < SABER_N / 16; i++) {
-        v_avx[i] = _mm256_xor_si256(v_avx[i], v_avx[i]);
-    }
+    PQCLEAN_SABER_AVX2_BS2POLVECp(temp, ciphertext);
+    PQCLEAN_SABER_AVX2_InnerProd(v, temp, sksv_eval);
 
+    PQCLEAN_SABER_AVX2_BS2POLT(cm, packed_cm);
 
-    // InnerProduct(b', s, mod p)
-
-    for (j = 0; j < NUM_POLY; j++) {
-        TC_eval(sksv_avx[j], b_bucket[j]);
-    }
-
-    vector_vector_mul(v_avx, pksv_avx, b_bucket);
-
-    for (i = 0; i < SABER_N / 16; i++) {
-        _mm256_maskstore_epi32 ((int *)(message_dec_unpacked + i * 16), _mm256_set1_epi32(-1), v_avx[i]);
-    }
-
-
-    for (i = 0; i < SABER_SCALEBYTES_KEM; i++) {
-        scale_ar[i] = ciphertext[SABER_CIPHERTEXTBYTES + i];
-    }
-
-    PQCLEAN_SABER_AVX2_SABER_un_pack4bit(op, scale_ar);
-
-
-    //addition of h2
     for (i = 0; i < SABER_N; i++) {
-        message_dec_unpacked[i] = ( ( message_dec_unpacked[i] + h2 - (op[i] << (SABER_EP - SABER_ET)) ) & (SABER_P - 1) ) >> (SABER_EP - 1);
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET));
+        v->coeffs[i] &= SABER_P - 1;
+        v->coeffs[i] >>= SABER_EP - 1;
     }
 
-
-    POL2MSG(m, message_dec_unpacked);
+    PQCLEAN_SABER_AVX2_POLmsg2BS(m, v);
 }
diff --git a/crypto_kem/saber/avx2/SABER_params.h b/crypto_kem/saber/avx2/SABER_params.h
index 9b0edafe..d1a5ddd7 100644
--- a/crypto_kem/saber/avx2/SABER_params.h
+++ b/crypto_kem/saber/avx2/SABER_params.h
@@ -1,46 +1,41 @@
 #ifndef PARAMS_H
 #define PARAMS_H
-#include "api.h"
 
 
-
-
-#define SABER_K 3
+/* Don't change anything below this line */
+#define SABER_L 3
 #define SABER_MU 8
 #define SABER_ET 4
 
+#define SABER_N 256
+
+#define SABER_EP 10
+#define SABER_P (1 << SABER_EP)
 
 #define SABER_EQ 13
-#define SABER_EP 10
+#define SABER_Q (1 << SABER_EQ)
 
-#define SABER_N 256
-#define SABER_Q 8192 //2^13
-#define SABER_P 1024
+#define SABER_SEEDBYTES 32
+#define SABER_NOISESEEDBYTES 32
+#define SABER_KEYBYTES 32
+#define SABER_HASHBYTES 32
 
-#define SABER_SEEDBYTES       32
-#define SABER_NOISESEEDBYTES  32
-#define SABER_COINBYTES       32
-#define SABER_KEYBYTES        32
+#define SABER_POLYCOINBYTES (SABER_MU * SABER_N / 8)
 
-#define SABER_HASHBYTES       32
+#define SABER_POLYBYTES (SABER_EQ * SABER_N / 8)
+#define SABER_POLYVECBYTES (SABER_L * SABER_POLYBYTES)
 
-#define SABER_POLYBYTES              416 //13*256/8 
+#define SABER_POLYCOMPRESSEDBYTES (SABER_EP * SABER_N / 8)
+#define SABER_POLYVECCOMPRESSEDBYTES (SABER_L * SABER_POLYCOMPRESSEDBYTES)
 
-#define SABER_POLYVECBYTES           (SABER_K * SABER_POLYBYTES)
-
-#define SABER_POLYVECCOMPRESSEDBYTES (SABER_K * 320) //10*256/8 NOTE : changed till here due to parameter adaptation
-
-#define SABER_CIPHERTEXTBYTES (SABER_POLYVECCOMPRESSEDBYTES)
-
-#define SABER_SCALEBYTES_KEM ((SABER_ET)*SABER_N/8)
+#define SABER_SCALEBYTES_KEM (SABER_ET * SABER_N / 8)
 
 #define SABER_INDCPA_PUBLICKEYBYTES (SABER_POLYVECCOMPRESSEDBYTES + SABER_SEEDBYTES)
 #define SABER_INDCPA_SECRETKEYBYTES (SABER_POLYVECBYTES)
 
 #define SABER_PUBLICKEYBYTES (SABER_INDCPA_PUBLICKEYBYTES)
+#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES + SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
 
-#define SABER_SECRETKEYBYTES (SABER_INDCPA_SECRETKEYBYTES +  SABER_INDCPA_PUBLICKEYBYTES + SABER_HASHBYTES + SABER_KEYBYTES)
-
-#define SABER_BYTES_CCA_DEC   (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM) /* Second part is for Targhi-Unruh */
+#define SABER_BYTES_CCA_DEC (SABER_POLYVECCOMPRESSEDBYTES + SABER_SCALEBYTES_KEM)
 
 #endif
diff --git a/crypto_kem/saber/avx2/cbd.c b/crypto_kem/saber/avx2/cbd.c
index 7639d7d2..53335375 100644
--- a/crypto_kem/saber/avx2/cbd.c
+++ b/crypto_kem/saber/avx2/cbd.c
@@ -11,7 +11,7 @@ Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
 
 
-static uint64_t load_littleendian(const unsigned char *x, int bytes) {
+static uint64_t load_littleendian(const uint8_t *x, int bytes) {
     int i;
     uint64_t r = x[0];
     for (i = 1; i < bytes; i++) {
@@ -20,32 +20,29 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
     return r;
 }
 
-
-void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf) {
-    uint16_t Qmod_minus1 = SABER_Q - 1;
-
+void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]) {
     uint32_t t, d, a[4], b[4];
     int i, j;
 
     for (i = 0; i < SABER_N / 4; i++) {
-        t = load_littleendian(buf + 4 * i, 4);
+        t = (uint32_t) load_littleendian(buf + 4 * i, 4);
         d = 0;
         for (j = 0; j < 4; j++) {
             d += (t >> j) & 0x11111111;
         }
 
-        a[0] =  d & 0xf;
-        b[0] = (d >>  4) & 0xf;
-        a[1] = (d >>  8) & 0xf;
+        a[0] = d & 0xf;
+        b[0] = (d >> 4) & 0xf;
+        a[1] = (d >> 8) & 0xf;
         b[1] = (d >> 12) & 0xf;
         a[2] = (d >> 16) & 0xf;
         b[2] = (d >> 20) & 0xf;
         a[3] = (d >> 24) & 0xf;
         b[3] = (d >> 28);
 
-        r[4 * i + 0] = (uint16_t)(a[0]  - b[0]) & Qmod_minus1;
-        r[4 * i + 1] = (uint16_t)(a[1]  - b[1]) & Qmod_minus1;
-        r[4 * i + 2] = (uint16_t)(a[2]  - b[2]) & Qmod_minus1;
-        r[4 * i + 3] = (uint16_t)(a[3]  - b[3]) & Qmod_minus1;
+        s[4 * i + 0] = (uint16_t)(a[0] - b[0]);
+        s[4 * i + 1] = (uint16_t)(a[1] - b[1]);
+        s[4 * i + 2] = (uint16_t)(a[2] - b[2]);
+        s[4 * i + 3] = (uint16_t)(a[3] - b[3]);
     }
 }
diff --git a/crypto_kem/saber/avx2/cbd.h b/crypto_kem/saber/avx2/cbd.h
index e80ffc75..afe84bf3 100644
--- a/crypto_kem/saber/avx2/cbd.h
+++ b/crypto_kem/saber/avx2/cbd.h
@@ -7,10 +7,10 @@ of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
 by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
 Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
 ----------------------------------------------------------------------*/
-#include "poly.h"
+#include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_SABER_AVX2_cbd(uint16_t *r, const unsigned char *buf);
+void PQCLEAN_SABER_AVX2_cbd(uint16_t s[SABER_N], const uint8_t buf[SABER_POLYCOINBYTES]);
 
 
 #endif
diff --git a/crypto_kem/saber/avx2/kem.c b/crypto_kem/saber/avx2/kem.c
index c88bb315..e47e985f 100644
--- a/crypto_kem/saber/avx2/kem.c
+++ b/crypto_kem/saber/avx2/kem.c
@@ -4,14 +4,12 @@
 #include "fips202.h"
 #include "randombytes.h"
 #include "verify.h"
-#include <immintrin.h>
+#include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 
 int PQCLEAN_SABER_AVX2_crypto_kem_keypair(uint8_t *pk, uint8_t *sk) {
-    int i;
+    size_t i;
 
     PQCLEAN_SABER_AVX2_indcpa_kem_keypair(pk, sk); // sk[0:SABER_INDCPA_SECRETKEYBYTES-1] <-- sk
     for (i = 0; i < SABER_INDCPA_PUBLICKEYBYTES; i++) {
@@ -39,7 +37,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk)
     sha3_512(kr, buf, 64);               // kr[0:63] <-- Hash(buf[0:63]);
     // K^ <-- kr[0:31]
     // noiseseed (r) <-- kr[32:63];
-    PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, (const uint8_t *) (kr + 32), pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
+    PQCLEAN_SABER_AVX2_indcpa_kem_enc(c, buf, kr + 32, pk); // buf[0:31] contains message; kr[32:63] contains randomness r;
 
     sha3_256(kr + 32, c, SABER_BYTES_CCA_DEC);
 
@@ -49,7 +47,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_enc(uint8_t *c, uint8_t *k, const uint8_t *pk)
 }
 
 int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_t *sk) {
-    int i;
+    size_t i;
     uint8_t fail;
     uint8_t cmp[SABER_BYTES_CCA_DEC];
     uint8_t buf[64];
@@ -65,7 +63,7 @@ int PQCLEAN_SABER_AVX2_crypto_kem_dec(uint8_t *k, const uint8_t *c, const uint8_
 
     sha3_512(kr, buf, 64);
 
-    PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, (const uint8_t *) (kr + 32), pk);
+    PQCLEAN_SABER_AVX2_indcpa_kem_enc(cmp, buf, kr + 32, pk);
 
     fail = PQCLEAN_SABER_AVX2_verify(c, cmp, SABER_BYTES_CCA_DEC);
 
diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h
index 612ff4ff..b28b04f6 100644
--- a/crypto_kem/saber/avx2/kem.h
+++ b/crypto_kem/saber/avx2/kem.h
@@ -1,35 +1,3 @@
-#ifndef INDCPA_H
-#define INDCPA_H
-
-#include <stdint.h>
-
-void PQCLEAN_SABER_AVX2_indcpa_keypair(uint8_t *pk, uint8_t *sk);
-
-
-void PQCLEAN_SABER_AVX2_indcpa_client(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_SABER_AVX2_indcpa_server(uint8_t *pk, uint8_t *b_prime, uint8_t *c, uint8_t *key);
-
-
-void PQCLEAN_SABER_AVX2_indcpa_kem_keypair(uint8_t *pk, uint8_t *sk);
-
-void PQCLEAN_SABER_AVX2_indcpa_kem_enc(uint8_t *message, uint8_t *noiseseed, uint8_t *pk,  uint8_t *ciphertext);
-
-void PQCLEAN_SABER_AVX2_indcpa_kem_dec(uint8_t *sk, uint8_t *ciphertext, uint8_t message_dec[]);
-
-
-int PQCLEAN_SABER_AVX2_crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
-
-int PQCLEAN_SABER_AVX2_crypto_kem_enc(unsigned char *c, unsigned char *k, const unsigned char *pk);
-
-int PQCLEAN_SABER_AVX2_crypto_kem_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk);
 
 
 
-//uint64_t clock1,clock2;
-
-//uint64_t clock_kp_kex, clock_enc_kex, clock_dec_kex;
-
-
-#endif
diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c
index 00bf9c08..9bb46acb 100644
--- a/crypto_kem/saber/avx2/pack_unpack.c
+++ b/crypto_kem/saber/avx2/pack_unpack.c
@@ -1,502 +1,145 @@
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
+#include <string.h>
 
-
-void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x7) | ( (data[offset_data + 1] & 0x7) << 3 ) | ((data[offset_data + 2] & 0x3) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 2] >> 2 ) & 0x01)  | ( (data[offset_data + 3] & 0x7) << 1 ) | ( (data[offset_data + 4] & 0x7) << 4 ) | (((data[offset_data + 5]) & 0x01) << 7);
-        bytes[offset_byte + 2] = ((data[offset_data + 5] >> 1 ) & 0x03) | ( (data[offset_data + 6] & 0x7) << 2 ) | ( (data[offset_data + 7] & 0x7) << 5 );
-    }
-}
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 3 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0]) & 0x07;
-        data[offset_data + 1] = ( (bytes[offset_byte + 0]) >> 3 ) & 0x07;
-        data[offset_data + 2] = ( ( (bytes[offset_byte + 0]) >> 6 ) & 0x03) | ( ( (bytes[offset_byte + 1]) & 0x01) << 2 );
-        data[offset_data + 3] = ( (bytes[offset_byte + 1]) >> 1 ) & 0x07;
-        data[offset_data + 4] = ( (bytes[offset_byte + 1]) >> 4 ) & 0x07;
-        data[offset_data + 5] = ( ( (bytes[offset_byte + 1]) >> 7 ) & 0x01) | ( ( (bytes[offset_byte + 2]) & 0x03) << 1 );
-        data[offset_data + 6] = ( (bytes[offset_byte + 2] >> 2) & 0x07 );
-        data[offset_data + 7] = ( (bytes[offset_byte + 2] >> 5) & 0x07 );
-    }
-
-}
-
-void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
+void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) { // Pack the low 4 bits of each of the SABER_N coefficients of *data into SABER_N / 2 bytes, two coefficients per byte.
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor: advances 2 coefficients per iteration
+    uint8_t *out = bytes; // write cursor: advances 1 byte per iteration
     for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        bytes[j] = (data[offset_data] & 0x0f) | ( (data[offset_data + 1] & 0x0f) << 4 );
+        out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); // even-index coefficient -> low nibble, odd-index coefficient -> high nibble
+        in += 2;
+        out += 1;
     }
 }
 
-void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0;
-
+void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) { // Unpack SABER_N / 2 bytes into SABER_N 4-bit coefficients of *data, two coefficients per byte.
+    size_t j;
+    const uint8_t *in = bytes; // read cursor: advances 1 byte per iteration
+    uint16_t *out = data->coeffs; // write cursor: advances 2 coefficients per iteration
     for (j = 0; j < SABER_N / 2; j++) {
-        offset_data = 2 * j;
-        data[offset_data] = bytes[j] & 0x0f;
-        data[offset_data + 1] = (bytes[j] >> 4) & 0x0f;
+        out[0] = in[0] & 0x0f; // low nibble -> even-index coefficient
+        out[1] = (in[0] >> 4) & 0x0f; // high nibble -> odd-index coefficient
+        in += 1;
+        out += 2;
     }
 }
 
-void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & 0x3f) | ((data[offset_data + 1] & 0x03) << 6);
-        bytes[offset_byte + 1] = ((data[offset_data + 1] >> 2) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 2] = ((data[offset_data + 2] >> 4) & 0x03) | ((data[offset_data + 3] & 0x3f) << 2);
-    }
-}
-
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 3 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = bytes[offset_byte + 0] & 0x3f;
-        data[offset_data + 1] = ((bytes[offset_byte + 0] >> 6) & 0x03) |  ((bytes[offset_byte + 1] & 0x0f) << 2)  ;
-        data[offset_data + 2] = ((bytes[offset_byte + 1] & 0xff) >> 4) | ((bytes[offset_byte + 2] & 0x03) << 4) ;
-        data[offset_data + 3] = ((bytes[offset_byte + 2] & 0xff) >> 2);
-    }
-
-}
-
-void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
-        }
-    }
-}
-
-void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x03 ) | ((data[i][ offset_data + 1 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 6) & 0x0f ) | ( (data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 4) & 0x3f ) | ((data[i][ offset_data + 3 ] & 0x03) << 6);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 3 ] >> 2) & 0xff );
-        }
-    }
-}
-
-void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) { // Pack the low 13 bits of each of the SABER_N coefficients of *data, 8 coefficients per 13 output bytes, least-significant bits first.
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor: advances 8 coefficients per iteration
+    uint8_t *out = bytes; // write cursor: advances 13 bytes per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)); // in[0] bits 0..7
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5); // in[0] bits 8..12, then in[1] bits 0..2
+        out[2] = ((in[1] >> 3) & 0xff); // in[1] bits 3..10
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2); // in[1] bits 11..12, then in[2] bits 0..5
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7); // in[2] bits 6..12, then in[3] bit 0
+        out[5] = ((in[3] >> 1) & 0xff); // in[3] bits 1..8
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4); // in[3] bits 9..12, then in[4] bits 0..3
+        out[7] = ((in[4] >> 4) & 0xff); // in[4] bits 4..11
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1); // in[4] bit 12, then in[5] bits 0..6
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6); // in[5] bits 7..12, then in[6] bits 0..1
+        out[10] = ((in[6] >> 2) & 0xff); // in[6] bits 2..9
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3); // in[6] bits 10..12, then in[7] bits 0..4
+        out[12] = ((in[7] >> 5) & 0xff); // in[7] bits 5..12
+        in += 8;
+        out += 13;
     }
 }
 
-
-
-void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-}
-
-void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 10) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 5 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) |  ((bytes[ offset_byte + 1 ] & 0x03) << 8);
-            data[i][offset_data + 1] = ( (bytes[ offset_byte + 1 ] >> 2) & (0x3f)) |  ((bytes[ offset_byte + 2 ] & 0x0f) << 6);
-            data[i][offset_data + 2] = ( (bytes[ offset_byte + 2 ] >> 4) & (0x0f)) |  ((bytes[ offset_byte + 3 ] & 0x3f) << 4);
-            data[i][offset_data + 3] = ( (bytes[ offset_byte + 3 ] >> 6) & (0x03)) |  ((bytes[ offset_byte + 4 ] & 0xff) << 2);
-
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x1f ) | ((data[i][ offset_data + 1 ] & 0x07) << 5);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 3) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 11) & 0x03 ) | ((data[i][ offset_data + 2 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 6) & 0x7f ) | ( (data[i][ offset_data + 3 ] & 0x01) << 7 );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 9) & 0x0f ) | ( (data[i][ offset_data + 4 ] & 0x0f) << 4 );
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 4] >> 4) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 4 ] >> 12) & 0x01 ) | ( (data[i][ offset_data + 5 ] & 0x7f) << 1 );
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 5 ] >> 7) & 0x3f ) | ( (data[i][ offset_data + 6 ] & 0x03) << 6 );
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 6 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 11] = ( (data[i][ offset_data + 6 ] >> 10) & 0x07 ) | ( (data[i][ offset_data + 7 ] & 0x1f) << 3 );
-
-            bytes[offset_byte + 12] = ( (data[i][ offset_data + 7 ] >> 5) & 0xff );
-
-        }
-    }
-
-
-}
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 13) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 13 * j;
-            offset_data = 8 * j;
-            data[i][offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-            data[i][offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-            data[i][offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-            data[i][offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-            data[i][offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-            data[i][offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-            data[i][offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-            data[i][offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
-        }
-    }
-
-
-}
-
-void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes) {
-
-    uint32_t j;
-    uint32_t offset_data = 0, offset_byte = 0;
-
-    //for(i=0;i<SABER_K;i++){
-    //i=0;
-    //offset_byte1=i*(SABER_N*13)/8;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) { // Unpack 13-bit coefficients into *data: every 13 input bytes yield 8 coefficients, SABER_N coefficients in total.
+    size_t j;
+    const uint8_t *in = bytes; // read cursor: advances 13 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor: advances 8 coefficients per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        //offset_byte=offset_byte1+13*j;
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = ( bytes[ offset_byte + 0 ] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = ( bytes[ offset_byte + 1 ] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = ( bytes[ offset_byte + 3 ] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = ( bytes[ offset_byte + 4 ] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = ( bytes[ offset_byte + 6 ] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = ( bytes[ offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = ( bytes[ offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = ( bytes[ offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); // 8 bits of in[0] + low 5 bits of in[1]
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11); // top 3 bits of in[1] + in[2] + low 2 bits of in[3]
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6); // top 6 bits of in[3] + low 7 bits of in[4]
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9); // top bit of in[4] + in[5] + low 4 bits of in[6]
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12); // top 4 bits of in[6] + in[7] + low bit of in[8]
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7); // top 7 bits of in[8] + low 6 bits of in[9]
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10); // top 2 bits of in[9] + in[10] + low 3 bits of in[11]
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5); // top 5 bits of in[11] + in[12]
+        in += 13;
+        out += 8;
     }
-    //}
-
-
 }
 
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) { // Pack the low 10 bits of each of the SABER_N coefficients of *data, 4 coefficients per 5 output bytes, least-significant bits first.
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor: advances 4 coefficients per iteration
+    uint8_t *out = bytes; // write cursor: advances 5 bytes per iteration
+    for (j = 0; j < SABER_N / 4; j++) {
+        out[0] = (in[0] & (0xff)); // in[0] bits 0..7
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2); // in[0] bits 8..9, then in[1] bits 0..5
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4); // in[1] bits 6..9, then in[2] bits 0..3
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6); // in[2] bits 4..9, then in[3] bits 0..1
+        out[4] = ((in[3] >> 2) & 0xff); // in[3] bits 2..9
+        in += 4;
+        out += 5;
+    }
+}
 
-void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-    /*This function packs 11 bit data stream into 8 bits of data.
-    */
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) { // Unpack 10-bit coefficients into *data: every 5 input bytes yield 4 coefficients, SABER_N coefficients in total.
+    size_t j;
+    const uint8_t *in = bytes; // read cursor: advances 5 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor: advances 4 coefficients per iteration
+    for (j = 0; j < SABER_N / 4; j++) {
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); // in[0] + low 2 bits of in[1]
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6); // top 6 bits of in[1] + low 4 bits of in[2]
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4); // top 4 bits of in[2] + low 6 bits of in[3]
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2); // top 2 bits of in[3] + in[4]
+        in += 5;
+        out += 4;
+    }
+}
 
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
+void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) { // Serialize a vector of SABER_L polynomials with POLq2BS, one polynomial after another.
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); // polynomial i is written at byte offset i * SABER_POLYBYTES
+    }
+}
 
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x07 ) | ((data[i][ offset_data + 1 ] & 0x1f) << 3);
+void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) { // Deserialize a vector of SABER_L polynomials with BS2POLq, one polynomial after another.
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); // polynomial i is read from byte offset i * SABER_POLYBYTES
+    }
+}
 
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 5) & 0x3f ) | ((data[i][ offset_data + 2 ] & 0x03) << 6);
+void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) { // Serialize a vector of SABER_L polynomials with POLp2BS, one polynomial after another.
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); // polynomial i is written at byte offset i * SABER_POLYCOMPRESSEDBYTES
+    }
+}
 
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 2 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 10) & 0x01 ) | ((data[i][ offset_data + 3 ] & 0x7f) << 1);
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 3 ] >> 7) & 0x0f ) | ((data[i][ offset_data + 4 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 4 ] >> 4) & 0x7f ) | ((data[i][ offset_data + 5 ] & 0x01) << 7);
-
-            bytes[offset_byte + 7] = ( (data[i][ offset_data + 5 ] >> 1) & 0xff );
-
-            bytes[offset_byte + 8] = ( (data[i][ offset_data + 5 ] >> 9) & 0x03 ) | ((data[i][ offset_data + 6 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 9] = ( (data[i][ offset_data + 6 ] >> 6) & 0x1f ) | ((data[i][ offset_data + 7 ] & 0x07) << 5);
-
-            bytes[offset_byte + 10] = ( (data[i][ offset_data + 7 ] >> 3) & 0xff );
+void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) { // Deserialize a vector of SABER_L polynomials with BS2POLp, one polynomial after another.
+    size_t i;
+    for (i = 0; i < SABER_L; i++) {
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); // polynomial i is read from byte offset i * SABER_POLYCOMPRESSEDBYTES
+    }
+}
 
+void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) { // Expand each bit of the SABER_KEYBYTES input bytes into one coefficient of *data with value 0 or 1.
+    size_t i, j;
+    for (j = 0; j < SABER_KEYBYTES; j++) { // j indexes the input byte
+        for (i = 0; i < 8; i++) { // i indexes the bit within that byte, least-significant bit first
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j becomes coefficient 8 * j + i
         }
     }
-
 }
 
-void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
+void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) { // Collect the least-significant bit of each of the first 8 * SABER_KEYBYTES coefficients of *data into SABER_KEYBYTES bytes.
+    size_t i, j;
+    memset(bytes, 0, SABER_KEYBYTES); // the output must start at zero because the loop that follows only ORs bits into it
 
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 11) / 8;
-        for (j = 0; j < SABER_N / 8; j++) {
-            offset_byte = offset_byte1 + 11 * j;
-            offset_data = 8 * j;
-
-            data[i][offset_data + 0] = (bytes[offset_byte + 0]) | ( (bytes[offset_byte + 1] & 0x07) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 3) & 0x1f) | ( (bytes[offset_byte + 2] & 0x3f) << 5 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 2] >> 6) & 0x03) | ( (bytes[offset_byte + 3] & 0xff) << 2 ) | ( (bytes[offset_byte + 4] & 0x01) << 10 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 4] >> 1) & 0x7f) | ( (bytes[offset_byte + 5] & 0x0f) << 7 );
-
-            data[i][offset_data + 4] = ( (bytes[offset_byte + 5] >> 4) & 0x0f) | ( (bytes[offset_byte + 6] & 0x7f) << 4 );
-
-            data[i][offset_data + 5] = ( (bytes[offset_byte + 6] >> 7) & 0x01) | ( (bytes[offset_byte + 7] & 0xff) << 1 ) | ( (bytes[offset_byte + 8] & 0x03) << 9 );
-
-            data[i][offset_data + 6] = ( (bytes[offset_byte + 8] >> 2) & 0x3f) | ( (bytes[offset_byte + 9] & 0x1f) << 6 );
-
-            data[i][offset_data + 7] = ( (bytes[offset_byte + 9] >> 5) & 0x07) | ( (bytes[offset_byte + 10] & 0xff) << 3 );
+    for (j = 0; j < SABER_KEYBYTES; j++) { // j indexes the output byte
+        for (i = 0; i < 8; i++) { // i indexes the bit within that byte, least-significant bit first
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); // low bit of coefficient 8 * j + i becomes bit i of byte j
         }
     }
-
-
-}
-
-void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            bytes[offset_byte + 0] = ( data[i][ offset_data + 0 ] & (0xff));
-
-            bytes[offset_byte + 1] = ( (data[i][ offset_data + 0 ] >> 8) & 0x3f ) | ((data[i][ offset_data + 1 ] & 0x03) << 6);
-
-            bytes[offset_byte + 2] = ( (data[i][ offset_data + 1 ] >> 2) & 0xff );
-
-            bytes[offset_byte + 3] = ( (data[i][ offset_data + 1 ] >> 10) & 0x0f ) | ((data[i][ offset_data + 2 ] & 0x0f) << 4);
-
-            bytes[offset_byte + 4] = ( (data[i][ offset_data + 2 ] >> 4) & 0xff );
-
-            bytes[offset_byte + 5] = ( (data[i][ offset_data + 2 ] >> 12) & 0x03 ) | ((data[i][ offset_data + 3 ] & 0x3f) << 2);
-
-            bytes[offset_byte + 6] = ( (data[i][ offset_data + 3 ] >> 6) & 0xff );
-        }
-    }
-
-
-}
-
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes) {
-
-    uint32_t i, j;
-    uint32_t offset_data = 0, offset_byte = 0, offset_byte1 = 0;
-
-    for (i = 0; i < SABER_K; i++) {
-        offset_byte1 = i * (SABER_N * 14) / 8;
-        for (j = 0; j < SABER_N / 4; j++) {
-            offset_byte = offset_byte1 + 7 * j;
-            offset_data = 4 * j;
-            data[i][offset_data + 0] = (bytes[offset_byte + 0] & 0xff) | ( (bytes[offset_byte + 1] & 0x3f) << 8 );
-
-            data[i][offset_data + 1] = ( (bytes[offset_byte + 1] >> 6) & 0x03) | ((bytes[offset_byte + 2] & 0xff) << 2 ) | ( (bytes[offset_byte + 3] & 0x0f) << 10 );
-
-            data[i][offset_data + 2] = ( (bytes[offset_byte + 3] >> 4) & 0x0f) | ( (bytes[offset_byte + 4] ) << 4 ) | ( (bytes[offset_byte + 5] & 0x03) << 12 );
-
-            data[i][offset_data + 3] = ( (bytes[offset_byte + 5] >> 2) & 0x3f) | ( (bytes[offset_byte + 6] ) << 6 );
-        }
-    }
-
-
-}
-
-void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_SABER_AVX2_POLVECp2BS(bytes, data);
-    } else if (modulus == 8192) {
-        PQCLEAN_SABER_AVX2_POLVECq2BS(bytes, data);
-    }
-}
-
-void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus) {
-
-    if (modulus == 1024) {
-        PQCLEAN_SABER_AVX2_BS2POLVECp(data, bytes);
-    } else if (modulus == 8192) {
-        PQCLEAN_SABER_AVX2_BS2POLVECq(data, bytes);
-    }
-
 }
diff --git a/crypto_kem/saber/avx2/pack_unpack.h b/crypto_kem/saber/avx2/pack_unpack.h
index e1608d4c..0965bbcd 100644
--- a/crypto_kem/saber/avx2/pack_unpack.h
+++ b/crypto_kem/saber/avx2/pack_unpack.h
@@ -1,56 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_SABER_AVX2_BS2POLq(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_SABER_AVX2_BS2POLVEC(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes, uint16_t modulus);
-
-void PQCLEAN_SABER_AVX2_BS2POLVECq(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_BS2POLVECp(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_SABER_AVX2_POLVEC2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N], uint16_t modulus);
+void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_SABER_AVX2_POLVECq2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_SABER_AVX2_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_SABER_AVX2_SABER_pack_3bit(uint8_t *bytes, const uint16_t *data);
+void PQCLEAN_SABER_AVX2_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_SABER_AVX2_SABER_pack_4bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_SABER_AVX2_SABER_pack_6bit(uint8_t *bytes, const uint16_t *data);
-
-void PQCLEAN_SABER_AVX2_SABER_pack10bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_SABER_AVX2_SABER_pack11bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_SABER_AVX2_SABER_pack13bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
-
-void PQCLEAN_SABER_AVX2_SABER_pack14bit(uint8_t *bytes, const uint16_t data[SABER_K][SABER_N]);
+void PQCLEAN_SABER_AVX2_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_SABER_AVX2_SABER_poly_un_pack13bit(uint16_t data[SABER_N], const uint8_t *bytes);
+void PQCLEAN_SABER_AVX2_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack3bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack4bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack6bit(uint16_t *data, const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack10bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack11bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack13bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
-
-void PQCLEAN_SABER_AVX2_SABER_un_pack14bit(uint16_t data[SABER_K][SABER_N], const uint8_t *bytes);
+void PQCLEAN_SABER_AVX2_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/saber/avx2/poly.c b/crypto_kem/saber/avx2/poly.c
new file mode 100644
index 00000000..1bc268b6
--- /dev/null
+++ b/crypto_kem/saber/avx2/poly.c
@@ -0,0 +1,62 @@
+#include "cbd.h"
+#include "fips202.h"
+#include "pack_unpack.h"
+#include "poly.h"
+
+
+void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose) {
+    // For every output index `row`, feeds the SABER_L pairs (A entry, s_eval[col])
+    // to toom4_mul_A_by_B_eval and then interpolates the result into c[row].
+    // A is read as A[row][col], or as A[col][row] when `transpose` is non-zero.
+    toom4_points_product acc;
+    size_t row, col;
+
+    for (row = 0; row < SABER_L; row++) {
+        for (col = 0; col < SABER_L; col++) {
+            // Pick the matrix entry that pairs with s_eval[col].
+            const poly *entry;
+            if (transpose) {
+                entry = &A[col][row];
+            } else {
+                entry = &A[row][col];
+            }
+            // The accumulate argument is 0 for the first term and 1 afterwards.
+            PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&acc, entry, &s_eval[col], col != 0);
+        }
+        PQCLEAN_SABER_AVX2_toom4_interp(&c[row], &acc);
+    }
+}
+
+void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]) {
+    // Feeds each pair (b[k], s_eval[k]) to toom4_mul_A_by_B_eval, with the accumulate
+    // argument 0 for k == 0 and 1 afterwards, then interpolates once into *c.
+    toom4_points_product acc;
+    size_t k = 0;
+
+    do {
+        PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(&acc, &b[k], &s_eval[k], k != 0);
+    } while (++k < SABER_L);
+    PQCLEAN_SABER_AVX2_toom4_interp(c, &acc);
+}
+
+void PQCLEAN_SABER_AVX2_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
+    // shake128 expands the seed into SABER_L chunks of SABER_POLYVECBYTES; chunk i is unpacked into row A[i].
+    uint8_t stream[SABER_L * SABER_POLYVECBYTES];
+    size_t row;
+
+    shake128(stream, sizeof stream, seed, SABER_SEEDBYTES);
+    for (row = 0; row < SABER_L; row++) {
+        PQCLEAN_SABER_AVX2_BS2POLVECq(A[row], &stream[row * SABER_POLYVECBYTES]);
+    }
+}
+
+void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
+    // shake128 expands the seed into SABER_L chunks of SABER_POLYCOINBYTES; chunk k is passed to cbd to fill s[k].
+    uint8_t coins[SABER_L * SABER_POLYCOINBYTES];
+    size_t k;
+
+    shake128(coins, sizeof coins, seed, SABER_NOISESEEDBYTES);
+    for (k = 0; k < SABER_L; k++) {
+        PQCLEAN_SABER_AVX2_cbd(s[k].coeffs, &coins[k * SABER_POLYCOINBYTES]);
+    }
+}
diff --git a/crypto_kem/saber/avx2/poly.h b/crypto_kem/saber/avx2/poly.h
index 2978d0d8..188e31e7 100644
--- a/crypto_kem/saber/avx2/poly.h
+++ b/crypto_kem/saber/avx2/poly.h
@@ -1,27 +1,38 @@
 #ifndef POLY_H
 #define POLY_H
-/*---------------------------------------------------------------------
-This file has been adapted from the implementation
-(available at, Public Domain https://github.com/pq-crystals/kyber)
-of "CRYSTALS – Kyber: a CCA-secure module-lattice-based KEM"
-by : Joppe Bos, Leo Ducas, Eike Kiltz, Tancrede Lepoint,
-Vadim Lyubashevsky, John M. Schanck, Peter Schwabe & Damien stehle
-----------------------------------------------------------------------*/
 #include "SABER_params.h"
+#include <immintrin.h>
 #include <stdint.h>
 
-typedef struct {
+typedef union { // one polynomial stored as SABER_N 16-bit coefficients
     uint16_t coeffs[SABER_N];
+    __m256i dummy; // not accessed by name in the code shown here; as a union member it gives the type __m256i alignment
 } poly;
 
-typedef struct {
-    poly vec[SABER_K];
-} polyvec;
+typedef union { // 4 * SABER_N 16-bit words; the type of the s_eval / b_eval operands in the prototypes below
+    uint16_t coeffs[4 * SABER_N];
+    __m256i dummy; // not accessed by name in the code shown here; as a union member it gives the type __m256i alignment
+} toom4_points;
 
-void PQCLEAN_SABER_AVX2_poly_getnoise(uint16_t *r, const unsigned char *seed, unsigned char nonce);
+typedef union { // 8 * SABER_N 16-bit words; the type of the c_eval argument of toom4_mul_A_by_B_eval and toom4_interp
+    uint16_t coeffs[8 * SABER_N];
+    __m256i dummy; // not accessed by name in the code shown here; as a union member it gives the type __m256i alignment
+} toom4_points_product;
+
+void PQCLEAN_SABER_AVX2_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const toom4_points s_eval[SABER_L], int transpose);
+
+void PQCLEAN_SABER_AVX2_InnerProd(poly *c, const poly b[SABER_L], const toom4_points s_eval[SABER_L]);
+
+void PQCLEAN_SABER_AVX2_GenMatrix(poly a[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_SABER_AVX2_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
 
 
-void PQCLEAN_SABER_AVX2_poly_getnoise4x(uint16_t *r0, uint16_t *r1, uint16_t *r2, const unsigned char *seed, unsigned char nonce0, unsigned char nonce1, unsigned char nonce2, unsigned char nonce3);
+void PQCLEAN_SABER_AVX2_toom4_interp(poly *res_avx, const toom4_points_product *c_eval);
+
+void PQCLEAN_SABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b);
+
+void PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a_avx, const toom4_points *b_eval, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/saber/avx2/poly_mul.c b/crypto_kem/saber/avx2/poly_mul.c
new file mode 100644
index 00000000..5ec0aa73
--- /dev/null
+++ b/crypto_kem/saber/avx2/poly_mul.c
@@ -0,0 +1,1524 @@
+#include "SABER_params.h"
+#include "poly.h"
+
+
+#define L (SABER_N / 64)
+
+static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
+    return _mm256_add_epi16(c, _mm256_mullo_epi16(a, b)); // c + (low 16 bits of a * b), per 16-bit lane
+}
+
+static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+    __m256i temp;
+    // Accumulating schoolbook product: c[k] += sum of a[i] * b[j] over i + j == k (i, j in 0..15; k in 0..30).
+    a0 = a[0]; // a[0..7] and b[0..7] are cached in locals for the first triangle
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+    // first triangle: c[0..15]; all arithmetic is per 16-bit lane (mullo_epi16 / add_epi16)
+    c[0] = mul_add(a0, b0, c[0]);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    temp = mul_add(a1, b0, temp);
+    c[1] = _mm256_add_epi16(temp, c[1]);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    temp = mul_add(a2, b0, temp);
+    c[2] = _mm256_add_epi16(temp, c[2]);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    temp = mul_add(a3, b0, temp);
+    c[3] = _mm256_add_epi16(temp, c[3]);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    c[4] = _mm256_add_epi16(temp, c[4]);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    temp = mul_add(a5, b0, temp);
+    c[5] = _mm256_add_epi16(temp, c[5]);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a4, b2, temp);
+    c[6] = _mm256_add_epi16(temp, c[6]);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a5, b2, temp);
+    c[7] = _mm256_add_epi16(temp, c[7]);
+
+    temp = _mm256_mullo_epi16(a0, b[8]);
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a6, b2, temp);
+    c[8] = _mm256_add_epi16(temp, c[8]);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a7, b2, temp);
+    c[9] = _mm256_add_epi16(temp, c[9]);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a[8], b2, temp);
+    c[10] = _mm256_add_epi16(temp, c[10]);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    temp = mul_add(a[9], b2, temp);
+    c[11] = _mm256_add_epi16(temp, c[11]);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    temp = mul_add(a[10], b2, temp);
+    c[12] = _mm256_add_epi16(temp, c[12]);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    temp = mul_add(a[11], b2, temp);
+    c[13] = _mm256_add_epi16(temp, c[13]);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    temp = mul_add(a[12], b2, temp);
+    c[14] = _mm256_add_epi16(temp, c[14]);
+
+    temp = _mm256_mullo_epi16(a0, b[15]);
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    temp = mul_add(a[13], b2, temp);
+    c[15] = _mm256_add_epi16(temp, c[15]);
+    // second triangle: c[16..30]; a0..a7 and b0..b7 are reloaded with words 14, 15, 13, 12, 11, 10, 9, 8
+    a0 = a[14];
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14];
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1);
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    temp = mul_add(a1, b[1], temp);
+    c[16] = _mm256_add_epi16(temp, c[16]);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    temp = mul_add(a1, b[2], temp);
+    c[17] = _mm256_add_epi16(temp, c[17]);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    temp = mul_add(a1, b[3], temp);
+    c[18] = _mm256_add_epi16(temp, c[18]);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    temp = mul_add(a1, b[4], temp);
+    c[19] = _mm256_add_epi16(temp, c[19]);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    temp = mul_add(a1, b[5], temp);
+    c[20] = _mm256_add_epi16(temp, c[20]);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    temp = mul_add(a1, b[6], temp);
+    c[21] = _mm256_add_epi16(temp, c[21]);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    temp = mul_add(a1, b[7], temp);
+    c[22] = _mm256_add_epi16(temp, c[22]);
+
+    temp = _mm256_mullo_epi16(a7, b1);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    temp = mul_add(a1, b7, temp);
+    c[23] = _mm256_add_epi16(temp, c[23]);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    temp = mul_add(a1, b6, temp);
+    c[24] = _mm256_add_epi16(temp, c[24]);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    temp = mul_add(a1, b5, temp);
+    c[25] = _mm256_add_epi16(temp, c[25]);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    temp = mul_add(a1, b4, temp);
+    c[26] = _mm256_add_epi16(temp, c[26]);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    temp = mul_add(a1, b3, temp);
+    c[27] = _mm256_add_epi16(temp, c[27]);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    temp = mul_add(a1, b2, temp);
+    c[28] = _mm256_add_epi16(temp, c[28]);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    temp = mul_add(a1, b0, temp);
+    c[29] = _mm256_add_epi16(temp, c[29]);
+
+    c[30] = mul_add(a1, b1, c[30]);
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // no product term lands in word 31; it is overwritten with zero rather than accumulated
+}
+
+
+static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
+    __m256i temp;
+    // Schoolbook product: c[k] = sum of a[i] * b[j] over i + j == k (i, j in 0..15; k in 0..30); c is written, never read.
+    a0 = a[0]; // a[0..7] and b[0..7] are cached in locals for the first triangle
+    a1 = a[1];
+    a2 = a[2];
+    a3 = a[3];
+    a4 = a[4];
+    a5 = a[5];
+    a6 = a[6];
+    a7 = a[7];
+
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    b4 = b[4];
+    b5 = b[5];
+    b6 = b[6];
+    b7 = b[7];
+    // first triangle: c[0..15]; all arithmetic is per 16-bit lane (mullo_epi16 / add_epi16)
+    c[0] = _mm256_mullo_epi16(a0, b0);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[1] = mul_add(a1, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b2);
+    temp = mul_add(a1, b1, temp);
+    c[2] = mul_add(a2, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b3);
+    temp = mul_add(a1, b2, temp);
+    temp = mul_add(a2, b1, temp);
+    c[3] = mul_add(a3, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b4);
+    temp = mul_add(a1, b3, temp);
+    temp = mul_add(a3, b1, temp);
+    temp = mul_add(a4, b0, temp);
+    c[4] = mul_add(a2, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b5);
+    temp = mul_add(a1, b4, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add( a4, b1, temp);
+    c[5] = mul_add(a5, b0, temp);
+
+    temp = _mm256_mullo_epi16(a0, b6);
+    temp = mul_add(a1, b5, temp);
+    temp = mul_add(a5, b1, temp);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a3, b3, temp);
+    c[6] = mul_add(a4, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b7);
+    temp = mul_add(a1, b6, temp);
+    temp = mul_add(a6, b1, temp);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a4, b3, temp);
+    c[7] = mul_add(a5, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[8]);
+    temp = mul_add(a1, b7, temp);
+    temp = mul_add(a7, b1, temp);
+    temp = mul_add(a[8], b0, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a5, b3, temp);
+    c[8] = mul_add(a6, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[9]);
+    temp = mul_add(a1, b[8], temp);
+    temp = mul_add(a[8], b1, temp);
+    temp = mul_add(a[9], b0, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a6, b3, temp);
+    c[9] = mul_add(a7, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[10]);
+    temp = mul_add(a1, b[9], temp);
+    temp = mul_add(a[9], b1, temp);
+    temp = mul_add(a[10], b0, temp);
+    temp = mul_add(a2, b[8], temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a7, b3, temp);
+    c[10] = mul_add(a[8], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[11]);
+    temp = mul_add(a1, b[10], temp);
+    temp = mul_add(a[10], b1, temp);
+    temp = mul_add(a[11], b0, temp);
+    temp = mul_add(a2, b[9], temp);
+    temp = mul_add(a3, b[8], temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a[8], b3, temp);
+    c[11] = mul_add(a[9], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[12]);
+    temp = mul_add(a1, b[11], temp);
+    temp = mul_add(a[11], b1, temp);
+    temp = mul_add(a[12], b0, temp);
+    temp = mul_add(a2, b[10], temp);
+    temp = mul_add(a3, b[9], temp);
+    temp = mul_add(a4, b[8], temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a[8], b4, temp);
+    temp = mul_add(a[9], b3, temp);
+    c[12] = mul_add(a[10], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[13]);
+    temp = mul_add(a1, b[12], temp);
+    temp = mul_add(a[12], b1, temp);
+    temp = mul_add(a[13], b0, temp);
+    temp = mul_add(a2, b[11], temp);
+    temp = mul_add(a3, b[10], temp);
+    temp = mul_add(a4, b[9], temp);
+    temp = mul_add(a5, b[8], temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a[8], b5, temp);
+    temp = mul_add(a[9], b4, temp);
+    temp = mul_add(a[10], b3, temp);
+    c[13] = mul_add(a[11], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[14]);
+    temp = mul_add(a1, b[13], temp);
+    temp = mul_add(a[13], b1, temp);
+    temp = mul_add(a[14], b0, temp);
+    temp = mul_add(a2, b[12], temp);
+    temp = mul_add(a3, b[11], temp);
+    temp = mul_add(a4, b[10], temp);
+    temp = mul_add(a5, b[9], temp);
+    temp = mul_add(a6, b[8], temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a[8], b6, temp);
+    temp = mul_add(a[9], b5, temp);
+    temp = mul_add(a[10], b4, temp);
+    temp = mul_add(a[11], b3, temp);
+    c[14] = mul_add(a[12], b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b[15]);
+    temp = mul_add(a1, b[14], temp);
+    temp = mul_add(a[14], b1, temp);
+    temp = mul_add(a[15], b0, temp);
+    temp = mul_add(a2, b[13], temp);
+    temp = mul_add(a3, b[12], temp);
+    temp = mul_add(a4, b[11], temp);
+    temp = mul_add(a5, b[10], temp);
+    temp = mul_add(a6, b[9], temp);
+    temp = mul_add(a7, b[8], temp);
+    temp = mul_add(a[8], b7, temp);
+    temp = mul_add(a[9], b6, temp);
+    temp = mul_add(a[10], b5, temp);
+    temp = mul_add(a[11], b4, temp);
+    temp = mul_add(a[12], b3, temp);
+    c[15] = mul_add(a[13], b2, temp);
+
+    // unrolled second triangle: c[16..30]; a0..a7 and b0..b7 are reloaded with words 14, 15, 13, 12, 11, 10, 9, 8
+    a0 = a[14];
+    a1 = a[15];
+    a2 = a[13];
+    a3 = a[12];
+    a4 = a[11];
+    a5 = a[10];
+    a6 = a[9];
+    a7 = a[8];
+
+    b0 = b[14];
+    b1 = b[15];
+    b2 = b[13];
+    b3 = b[12];
+    b4 = b[11];
+    b5 = b[10];
+    b6 = b[9];
+    b7 = b[8];
+
+    temp = _mm256_mullo_epi16(a[1], b1);
+    temp = mul_add(a[2], b0, temp);
+    temp = mul_add(a[3], b2, temp);
+    temp = mul_add(a[4], b3, temp);
+    temp = mul_add(a[5], b4, temp);
+    temp = mul_add(a[6], b5, temp);
+    temp = mul_add(a[7], b6, temp);
+    temp = mul_add(a7, b7, temp);
+    temp = mul_add(a6, b[7], temp);
+    temp = mul_add(a5, b[6], temp);
+    temp = mul_add(a4, b[5], temp);
+    temp = mul_add(a3, b[4], temp);
+    temp = mul_add(a2, b[3], temp);
+    temp = mul_add(a0, b[2], temp);
+    c[16] = mul_add(a1, b[1], temp);
+
+    temp = _mm256_mullo_epi16(a[2], b1);
+    temp = mul_add(a[3], b0, temp);
+    temp = mul_add(a[4], b2, temp);
+    temp = mul_add(a[5], b3, temp);
+    temp = mul_add(a[6], b4, temp);
+    temp = mul_add(a[7], b5, temp);
+    temp = mul_add(a7, b6, temp);
+    temp = mul_add(a6, b7, temp);
+    temp = mul_add(a5, b[7], temp);
+    temp = mul_add(a4, b[6], temp);
+    temp = mul_add(a3, b[5], temp);
+    temp = mul_add(a2, b[4], temp);
+    temp = mul_add(a0, b[3], temp);
+    c[17] = mul_add(a1, b[2], temp);
+
+    temp = _mm256_mullo_epi16(a[3], b1);
+    temp = mul_add(a[4], b0, temp);
+    temp = mul_add(a[5], b2, temp);
+    temp = mul_add(a[6], b3, temp);
+    temp = mul_add(a[7], b4, temp);
+    temp = mul_add(a7, b5, temp);
+    temp = mul_add(a6, b6, temp);
+    temp = mul_add(a5, b7, temp);
+    temp = mul_add(a4, b[7], temp);
+    temp = mul_add(a3, b[6], temp);
+    temp = mul_add(a2, b[5], temp);
+    temp = mul_add(a0, b[4], temp);
+    c[18] = mul_add(a1, b[3], temp);
+
+    temp = _mm256_mullo_epi16(a[4], b1);
+    temp = mul_add(a[5], b0, temp);
+    temp = mul_add(a[6], b2, temp);
+    temp = mul_add(a[7], b3, temp);
+    temp = mul_add(a7, b4, temp);
+    temp = mul_add(a6, b5, temp);
+    temp = mul_add(a5, b6, temp);
+    temp = mul_add(a4, b7, temp);
+    temp = mul_add(a3, b[7], temp);
+    temp = mul_add(a2, b[6], temp);
+    temp = mul_add(a0, b[5], temp);
+    c[19] = mul_add(a1, b[4], temp);
+
+    temp = _mm256_mullo_epi16(a[5], b1);
+    temp = mul_add(a[6], b0, temp);
+    temp = mul_add(a[7], b2, temp);
+    temp = mul_add(a7, b3, temp);
+    temp = mul_add(a6, b4, temp);
+    temp = mul_add(a5, b5, temp);
+    temp = mul_add(a4, b6, temp);
+    temp = mul_add(a3, b7, temp);
+    temp = mul_add(a2, b[7], temp);
+    temp = mul_add(a0, b[6], temp);
+    c[20] = mul_add(a1, b[5], temp);
+
+    temp = _mm256_mullo_epi16(a[6], b1);
+    temp = mul_add(a[7], b0, temp);
+    temp = mul_add(a7, b2, temp);
+    temp = mul_add(a6, b3, temp);
+    temp = mul_add(a5, b4, temp);
+    temp = mul_add(a4, b5, temp);
+    temp = mul_add(a3, b6, temp);
+    temp = mul_add(a2, b7, temp);
+    temp = mul_add(a0, b[7], temp);
+    c[21] = mul_add(a1, b[6], temp);
+
+    temp = _mm256_mullo_epi16(a[7], b1);
+    temp = mul_add(a7, b0, temp);
+    temp = mul_add(a6, b2, temp);
+    temp = mul_add(a5, b3, temp);
+    temp = mul_add(a4, b4, temp);
+    temp = mul_add(a3, b5, temp);
+    temp = mul_add(a2, b6, temp);
+    temp = mul_add(a0, b7, temp);
+    c[22] = mul_add(a1, b[7], temp);
+
+    temp = _mm256_mullo_epi16(a7, b1);
+    temp = mul_add(a6, b0, temp);
+    temp = mul_add(a5, b2, temp);
+    temp = mul_add(a4, b3, temp);
+    temp = mul_add(a3, b4, temp);
+    temp = mul_add(a2, b5, temp);
+    temp = mul_add(a0, b6, temp);
+    c[23] = mul_add(a1, b7, temp);
+
+    temp = _mm256_mullo_epi16(a6, b1);
+    temp = mul_add(a5, b0, temp);
+    temp = mul_add(a4, b2, temp);
+    temp = mul_add(a3, b3, temp);
+    temp = mul_add(a2, b4, temp);
+    temp = mul_add(a0, b5, temp);
+    c[24] = mul_add(a1, b6, temp);
+
+    temp = _mm256_mullo_epi16(a5, b1);
+    temp = mul_add(a4, b0, temp);
+    temp = mul_add(a3, b2, temp);
+    temp = mul_add(a2, b3, temp);
+    temp = mul_add(a0, b4, temp);
+    c[25] = mul_add(a1, b5, temp);
+
+    temp = _mm256_mullo_epi16(a4, b1);
+    temp = mul_add(a3, b0, temp);
+    temp = mul_add(a2, b2, temp);
+    temp = mul_add(a0, b3, temp);
+    c[26] = mul_add(a1, b4, temp);
+
+    temp = _mm256_mullo_epi16(a3, b1);
+    temp = mul_add(a2, b0, temp);
+    temp = mul_add(a0, b2, temp);
+    c[27] = mul_add(a1, b3, temp);
+
+    temp = _mm256_mullo_epi16(a2, b1);
+    temp = mul_add(a0, b0, temp);
+    c[28] = mul_add(a1, b2, temp);
+
+    temp = _mm256_mullo_epi16(a0, b1);
+    c[29] = mul_add(a1, b0, temp);
+
+    c[30] = _mm256_mullo_epi16(a1, b1);
+
+    c[31] = _mm256_set_epi64x(0, 0, 0, 0); // no product term lands in word 31; it is set to zero
+}
+
+static void transpose(__m256i *M) {
+    __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
+    __m256i temp, temp0, temp1, temp2;
+    // In-place transpose of M[0..15], read as a 16 x 16 matrix of 16-bit words with one row per vector.
+    r0 = _mm256_unpacklo_epi16(M[0], M[1]); // 16-bit interleave of row pairs: words 0..3 of each 128-bit half
+    r1 = _mm256_unpacklo_epi16(M[2], M[3]);
+    r2 = _mm256_unpacklo_epi16(M[4], M[5]);
+    r3 = _mm256_unpacklo_epi16(M[6], M[7]);
+    r4 = _mm256_unpacklo_epi16(M[8], M[9]);
+    r5 = _mm256_unpacklo_epi16(M[10], M[11]);
+    r6 = _mm256_unpacklo_epi16(M[12], M[13]);
+    r7 = _mm256_unpacklo_epi16(M[14], M[15]);
+
+    temp = _mm256_unpacklo_epi32(r0, r1); // 32-bit interleave: combines four rows at a time
+    temp0 = _mm256_unpacklo_epi32(r2, r3);
+    temp1 = _mm256_unpacklo_epi32(r4, r5);
+    temp2 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpackhi_epi32(r0, r1);
+    r9 = _mm256_unpackhi_epi32(r2, r3);
+    r10 = _mm256_unpackhi_epi32(r4, r5);
+    r11 = _mm256_unpackhi_epi32(r6, r7);
+
+    r0 = _mm256_unpacklo_epi64(temp, temp0); // 64-bit interleave: combines eight rows at a time
+    r2 = _mm256_unpackhi_epi64(temp, temp0);
+    r1 = _mm256_unpacklo_epi64(temp1, temp2);
+    r3 = _mm256_unpackhi_epi64(temp1, temp2);
+
+    temp = _mm256_unpackhi_epi16(M[0], M[1]); // same first step for words 4..7 of each 128-bit half
+    temp0 = _mm256_unpackhi_epi16(M[2], M[3]);
+    temp1 = _mm256_unpackhi_epi16(M[4], M[5]);
+    temp2 = _mm256_unpackhi_epi16(M[6], M[7]);
+
+    r4 = _mm256_unpackhi_epi16(M[8], M[9]); // M[8] and M[9] are read here, before the stores below overwrite them
+    M[0] = _mm256_permute2f128_si256(r0, r1, 0x20); // 0x20 joins the low 128-bit halves of both operands
+    M[8] = _mm256_permute2f128_si256(r0, r1, 0x31); // 0x31 joins the high 128-bit halves of both operands
+    M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    r5 = _mm256_unpackhi_epi16(M[10], M[11]);
+    r6 = _mm256_unpackhi_epi16(M[12], M[13]);
+    r7 = _mm256_unpackhi_epi16(M[14], M[15]);
+
+    r0 = _mm256_unpacklo_epi64(r8, r9);
+    r1 = _mm256_unpacklo_epi64(r10, r11);
+    r2 = _mm256_unpackhi_epi64(r8, r9);
+    r3 = _mm256_unpackhi_epi64(r10, r11);
+
+    M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
+    M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
+    M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
+    M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
+
+    r0 = _mm256_unpacklo_epi32(temp, temp0);
+    r1 = _mm256_unpacklo_epi32(temp1, temp2);
+    r2 = _mm256_unpacklo_epi32(r4, r5);
+    r3 = _mm256_unpacklo_epi32(r6, r7);
+
+    r8 = _mm256_unpacklo_epi64(r0, r1);
+    r10 = _mm256_unpackhi_epi64(r0, r1);
+    r9 = _mm256_unpacklo_epi64(r2, r3);
+    r11 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
+    M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
+    M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
+    M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
+
+    r0 = _mm256_unpackhi_epi32(temp, temp0);
+    r1 = _mm256_unpackhi_epi32(temp1, temp2);
+    r2 = _mm256_unpackhi_epi32(r4, r5);
+    r3 = _mm256_unpackhi_epi32(r6, r7);
+
+    r4 = _mm256_unpacklo_epi64(r0, r1);
+    r6 = _mm256_unpackhi_epi64(r0, r1);
+    r5 = _mm256_unpacklo_epi64(r2, r3);
+    r7 = _mm256_unpackhi_epi64(r2, r3);
+
+    M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
+    M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
+    M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
+    M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
+}
+
+static void batch_64coefficient_multiplications(toom4_points_product *c_eval, const __m256i *a, const toom4_points *b_eval, int accumulate) {
+    toom4_points a_eval;// Holds evaluation (a & b) for 7 Karatsuba at a time
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx;
+    __m256i *va = (__m256i *)a_eval.coeffs;
+    __m256i *vb = (__m256i *)b_eval->coeffs;
+    __m256i *vc = (__m256i *)c_eval->coeffs;
+
+    //------------------AVX evaluation for 1st poly-----------------------
+    r0_avx = a[0 * L + 0];
+    r1_avx = a[0 * L + 1];
+    r2_avx = a[0 * L + 2];
+    r3_avx = a[0 * L + 3];
+
+    va[0] = r0_avx;
+    va[1] = r1_avx;
+    va[2] = r2_avx;
+    va[3] = r3_avx;
+    va[4] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8] = _mm256_add_epi16(va[6], va[7]);
+    //------------------AVX evaluation for 1st poly ends------------------
+
+    //------------------AVX evaluation for 2nd poly-----------------------
+    r0_avx = a[1 * L + 0];
+    r1_avx = a[1 * L + 1];
+    r2_avx = a[1 * L + 2];
+    r3_avx = a[1 * L + 3];
+
+    va[0 + 9] = r0_avx;
+    va[1 + 9] = r1_avx;
+    va[2 + 9] = r2_avx;
+    va[3 + 9] = r3_avx;
+    va[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 9] = _mm256_add_epi16(va[6 + 9], va[7 + 9]);
+    //------------------AVX evaluation for 2nd poly ends------------------
+
+    //------------------AVX evaluation for 3rd poly-----------------------
+    r0_avx = a[2 * L + 0];
+    r1_avx = a[2 * L + 1];
+    r2_avx = a[2 * L + 2];
+    r3_avx = a[2 * L + 3];
+
+    va[0 + 18] = r0_avx;
+    va[1 + 18] = r1_avx;
+    va[2 + 18] = r2_avx;
+    va[3 + 18] = r3_avx;
+    va[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 18] = _mm256_add_epi16(va[6 + 18], va[7 + 18]);
+    //------------------AVX evaluation for 3rd poly ends------------------
+
+    //------------------AVX evaluation for 4th poly-----------------------
+    r0_avx = a[3 * L + 0];
+    r1_avx = a[3 * L + 1];
+    r2_avx = a[3 * L + 2];
+    r3_avx = a[3 * L + 3];
+
+    va[0 + 27] = r0_avx;
+    va[1 + 27] = r1_avx;
+    va[2 + 27] = r2_avx;
+    va[3 + 27] = r3_avx;
+    va[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 27] = _mm256_add_epi16(va[6 + 27], va[7 + 27]);
+    //------------------AVX evaluation for 4th poly ends------------------
+
+    //------------------AVX evaluation for 5th poly-----------------------
+    r0_avx = a[4 * L + 0];
+    r1_avx = a[4 * L + 1];
+    r2_avx = a[4 * L + 2];
+    r3_avx = a[4 * L + 3];
+
+    va[0 + 36] = r0_avx;
+    va[1 + 36] = r1_avx;
+    va[2 + 36] = r2_avx;
+    va[3 + 36] = r3_avx;
+    va[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 36] = _mm256_add_epi16(va[6 + 36], va[7 + 36]);
+    //------------------AVX evaluation for 5th poly ends------------------
+
+    //------------------AVX evaluation for 6th poly-----------------------
+    r0_avx = a[5 * L + 0];
+    r1_avx = a[5 * L + 1];
+    r2_avx = a[5 * L + 2];
+    r3_avx = a[5 * L + 3];
+
+    va[0 + 45] = r0_avx;
+    va[1 + 45] = r1_avx;
+    va[2 + 45] = r2_avx;
+    va[3 + 45] = r3_avx;
+    va[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 45] = _mm256_add_epi16(va[6 + 45], va[7 + 45]);
+    //------------------AVX evaluation for 6th poly ends------------------
+
+    //------------------AVX evaluation for 7th poly-----------------------
+    r0_avx = a[6 * L + 0];
+    r1_avx = a[6 * L + 1];
+    r2_avx = a[6 * L + 2];
+    r3_avx = a[6 * L + 3];
+
+    va[0 + 54] = r0_avx;
+    va[1 + 54] = r1_avx;
+    va[2 + 54] = r2_avx;
+    va[3 + 54] = r3_avx;
+    va[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    va[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    va[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    va[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    va[8 + 54] = _mm256_add_epi16(va[6 + 54], va[7 + 54]);
+    //------------------AVX evaluation for 7th poly ends------------------
+
+    //-----------------Forward transposes--------------------------------------
+    transpose(va);
+    transpose(va + 16);
+    transpose(va + 32);
+    transpose(va + 48);
+    //-----------------Forward transposes ends---------------------------------
+
+    if (accumulate == 0) {
+        schoolbook_avx(vc, va, vb);
+        schoolbook_avx(vc + 32, va + 16, vb + 16);
+        schoolbook_avx(vc + 64, va + 32, vb + 32);
+        schoolbook_avx(vc + 96, va + 48, vb + 48);
+    } else {
+        schoolbook_avx_acc(vc, va, vb);
+        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
+        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
+        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+    }
+}
+
+static void karatsuba_eval(__m256i *b_eval, const __m256i *b) {
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx; // the four input vectors of the group currently being expanded
+    // For each group j = 0..6: read b[j*L + 0..3] and store the 9 operand combinations of a two-level Karatsuba split in b_eval[9*j + 0..8]; afterwards transpose() is called on b_eval.
+    //-------1st poly: b[0*L + 0..3] -> b_eval[0..8]----------------------
+    r0_avx = b[0 * L + 0];
+    r1_avx = b[0 * L + 1];
+    r2_avx = b[0 * L + 2];
+    r3_avx = b[0 * L + 3];
+    // slots 0..3: the inputs; 4: r0+r1; 5: r2+r3; 6: r0+r2; 7: r1+r3; 8: r0+r1+r2+r3 (lane-wise 16-bit additions)
+    b_eval[0] = r0_avx;
+    b_eval[1] = r1_avx;
+    b_eval[2] = r2_avx;
+    b_eval[3] = r3_avx;
+    b_eval[4] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8] = _mm256_add_epi16(b_eval[6], b_eval[7]);
+
+    //-------2nd poly: b[1*L + 0..3] -> b_eval[9..17]---------------------
+    r0_avx = b[1 * L + 0];
+    r1_avx = b[1 * L + 1];
+    r2_avx = b[1 * L + 2];
+    r3_avx = b[1 * L + 3];
+
+    b_eval[0 + 9] = r0_avx;
+    b_eval[1 + 9] = r1_avx;
+    b_eval[2 + 9] = r2_avx;
+    b_eval[3 + 9] = r3_avx;
+    b_eval[4 + 9] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 9] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 9] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 9] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 9] = _mm256_add_epi16(b_eval[6 + 9], b_eval[7 + 9]);
+
+    //-------3rd poly: b[2*L + 0..3] -> b_eval[18..26]--------------------
+    r0_avx = b[2 * L + 0];
+    r1_avx = b[2 * L + 1];
+    r2_avx = b[2 * L + 2];
+    r3_avx = b[2 * L + 3];
+
+    b_eval[0 + 18] = r0_avx;
+    b_eval[1 + 18] = r1_avx;
+    b_eval[2 + 18] = r2_avx;
+    b_eval[3 + 18] = r3_avx;
+    b_eval[4 + 18] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 18] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 18] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 18] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 18] = _mm256_add_epi16(b_eval[6 + 18], b_eval[7 + 18]);
+
+    //-------4th poly: b[3*L + 0..3] -> b_eval[27..35]--------------------
+    r0_avx = b[3 * L + 0];
+    r1_avx = b[3 * L + 1];
+    r2_avx = b[3 * L + 2];
+    r3_avx = b[3 * L + 3];
+
+    b_eval[0 + 27] = r0_avx;
+    b_eval[1 + 27] = r1_avx;
+    b_eval[2 + 27] = r2_avx;
+    b_eval[3 + 27] = r3_avx;
+    b_eval[4 + 27] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 27] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 27] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 27] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 27] = _mm256_add_epi16(b_eval[6 + 27], b_eval[7 + 27]);
+
+    //-------5th poly: b[4*L + 0..3] -> b_eval[36..44]--------------------
+    r0_avx = b[4 * L + 0];
+    r1_avx = b[4 * L + 1];
+    r2_avx = b[4 * L + 2];
+    r3_avx = b[4 * L + 3];
+
+    b_eval[0 + 36] = r0_avx;
+    b_eval[1 + 36] = r1_avx;
+    b_eval[2 + 36] = r2_avx;
+    b_eval[3 + 36] = r3_avx;
+    b_eval[4 + 36] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 36] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 36] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 36] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 36] = _mm256_add_epi16(b_eval[6 + 36], b_eval[7 + 36]);
+
+    //-------6th poly: b[5*L + 0..3] -> b_eval[45..53]--------------------
+    r0_avx = b[5 * L + 0];
+    r1_avx = b[5 * L + 1];
+    r2_avx = b[5 * L + 2];
+    r3_avx = b[5 * L + 3];
+
+    b_eval[0 + 45] = r0_avx;
+    b_eval[1 + 45] = r1_avx;
+    b_eval[2 + 45] = r2_avx;
+    b_eval[3 + 45] = r3_avx;
+    b_eval[4 + 45] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 45] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 45] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 45] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 45] = _mm256_add_epi16(b_eval[6 + 45], b_eval[7 + 45]);
+
+    //-------7th poly: b[6*L + 0..3] -> b_eval[54..62]--------------------
+    r0_avx = b[6 * L + 0];
+    r1_avx = b[6 * L + 1];
+    r2_avx = b[6 * L + 2];
+    r3_avx = b[6 * L + 3];
+
+    b_eval[0 + 54] = r0_avx;
+    b_eval[1 + 54] = r1_avx;
+    b_eval[2 + 54] = r2_avx;
+    b_eval[3 + 54] = r3_avx;
+    b_eval[4 + 54] = _mm256_add_epi16(r0_avx, r1_avx);
+    b_eval[5 + 54] = _mm256_add_epi16(r2_avx, r3_avx);
+    b_eval[6 + 54] = _mm256_add_epi16(r0_avx, r2_avx);
+    b_eval[7 + 54] = _mm256_add_epi16(r1_avx, r3_avx);
+    b_eval[8 + 54] = _mm256_add_epi16(b_eval[6 + 54], b_eval[7 + 54]);
+    // NOTE(review): only b_eval[0..62] are written above; if transpose() processes 16 vectors per call, the last call below also touches b_eval[63] - confirm this is intended.
+    //--------------Evaluation ends; transpose() is applied to b_eval at offsets 0, 16, 32 and 48----
+    transpose(b_eval);
+    transpose(b_eval + 16);
+    transpose(b_eval + 32);
+    transpose(b_eval + 48);
+}
+
+static void karatsuba_interp(__m256i *result_final0, __m256i *result_final1, __m256i *result_final2, __m256i *result_final3, __m256i *result_final4, __m256i *result_final5, __m256i *result_final6, const __m256i *c_eval) {
+    __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // the eight vectors of the product currently being rebuilt
+    __m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx; // scratch: partial-product vectors and Karatsuba middle terms
+    // Rebuilds 7 products (8 vectors each, stored in result_final0..6) from 9 partial products apiece. Partial product p = 9*j + k (j = output index, k = 0..8) is read as two vectors: c_eval[32*(p/16) + p%16] and the entry 16 slots after it.
+    //---1st poly: partial products 0..8 -> first vectors c_eval[0..8], second vectors c_eval[16..24]---
+    res_avx0 = c_eval[0];
+    res_avx2 = c_eval[1];
+    res_avx4 = c_eval[2];
+    res_avx6 = c_eval[3];
+    c6_avx = c_eval[6];
+    c7_avx = c_eval[7];
+    // c8: first vector of partial product 8 minus the first vectors of partial products 6 and 7
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[8], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[16];
+    res_avx3 = c_eval[17];
+    res_avx5 = c_eval[18];
+    res_avx7 = c_eval[19];
+    c22_avx = c_eval[22];
+    c23_avx = c_eval[23];
+    // inner level: c20/c21/c24/c8 and temp are the middle terms (sum product minus its two operand products) of the triples (0,1,4), (2,3,5) and (6,7,8)
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[21], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[24], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[20], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[5], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[4], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+    // outer level: [c6, c22, c7, c23] now holds the outer middle term; it is added at vector offsets 2..5 of the 8-vector result
+    result_final0[0] = res_avx0;
+    result_final0[1] = res_avx1;
+    result_final0[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final0[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final0[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final0[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final0[6] = res_avx6;
+    result_final0[7] = res_avx7;
+    //------------------------AVX interpolation for 1st poly ends--------------
+
+
+    //---2nd poly: partial products 9..17 -> first vectors c_eval[9..15], c_eval[32..33]; second vectors c_eval[25..31], c_eval[48..49]---
+    res_avx0 = c_eval[9]; // partial product 0, first vector
+    res_avx2 = c_eval[10]; // partial product 1, first vector
+    res_avx4 = c_eval[11]; // partial product 2, first vector
+    res_avx6 = c_eval[12]; // partial product 3, first vector
+    c6_avx = c_eval[15]; // partial product 6, first vector
+    c7_avx = c_eval[32]; // partial product 7, first vector
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[33], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[25]; // partial product 0, second vector
+    res_avx3 = c_eval[26]; // partial product 1, second vector
+    res_avx5 = c_eval[27]; // partial product 2, second vector
+    res_avx7 = c_eval[28]; // partial product 3, second vector
+    c22_avx = c_eval[31];
+    c23_avx = c_eval[48];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[30], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[49], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[29], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[14], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[13], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final1[0] = res_avx0;
+    result_final1[1] = res_avx1;
+    result_final1[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final1[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final1[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final1[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final1[6] = res_avx6;
+    result_final1[7] = res_avx7;
+    //------------------------AVX interpolation for 2nd poly ends--------------
+
+    //---3rd poly: partial products 18..26 -> first vectors c_eval[34..42], second vectors c_eval[50..58]---
+    res_avx0 = c_eval[34]; // partial product 0, first vector
+    res_avx2 = c_eval[35]; // partial product 1, first vector
+    res_avx4 = c_eval[36];
+    res_avx6 = c_eval[37];
+    c6_avx = c_eval[40];
+    c7_avx = c_eval[41];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[42], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[50]; // partial product 0, second vector
+    res_avx3 = c_eval[51]; // partial product 1, second vector
+    res_avx5 = c_eval[52];
+    res_avx7 = c_eval[53];
+    c22_avx = c_eval[56];
+    c23_avx = c_eval[57];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[55], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[58], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[54], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[39], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[38], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final2[0] = res_avx0;
+    result_final2[1] = res_avx1;
+    result_final2[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final2[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final2[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final2[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final2[6] = res_avx6;
+    result_final2[7] = res_avx7;
+    //------------------------AVX interpolation for 3rd poly ends--------------
+
+    //---4th poly: partial products 27..35 -> first vectors c_eval[43..47], c_eval[64..67]; second vectors c_eval[59..63], c_eval[80..83]---
+    res_avx0 = c_eval[43];
+    res_avx2 = c_eval[44];
+    res_avx4 = c_eval[45];
+    res_avx6 = c_eval[46];
+    c6_avx = c_eval[65];
+    c7_avx = c_eval[66];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[67], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[59];
+    res_avx3 = c_eval[60];
+    res_avx5 = c_eval[61];
+    res_avx7 = c_eval[62];
+    c22_avx = c_eval[81];
+    c23_avx = c_eval[82];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[80], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[83], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[63], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[64], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[47], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final3[0] = res_avx0;
+    result_final3[1] = res_avx1;
+    result_final3[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final3[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final3[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final3[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final3[6] = res_avx6;
+    result_final3[7] = res_avx7;
+    //------------------------AVX interpolation for 4th poly ends--------------
+
+    //---5th poly: partial products 36..44 -> first vectors c_eval[68..76], second vectors c_eval[84..92]---
+    res_avx0 = c_eval[68];
+    res_avx2 = c_eval[69];
+    res_avx4 = c_eval[70];
+    res_avx6 = c_eval[71];
+    c6_avx = c_eval[74];
+    c7_avx = c_eval[75];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[76], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[84];
+    res_avx3 = c_eval[85];
+    res_avx5 = c_eval[86];
+    res_avx7 = c_eval[87];
+    c22_avx = c_eval[90];
+    c23_avx = c_eval[91];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[89], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[92], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[88], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[73], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[72], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final4[0] = res_avx0;
+    result_final4[1] = res_avx1;
+    result_final4[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final4[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final4[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final4[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final4[6] = res_avx6;
+    result_final4[7] = res_avx7;
+    //------------------------AVX interpolation for 5th poly ends--------------
+
+    //---6th poly: partial products 45..53 -> first vectors c_eval[77..79], c_eval[96..101]; second vectors c_eval[93..95], c_eval[112..117]---
+    res_avx0 = c_eval[77];
+    res_avx2 = c_eval[78];
+    res_avx4 = c_eval[79];
+    res_avx6 = c_eval[96];
+    c6_avx = c_eval[99];
+    c7_avx = c_eval[100];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[101], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[93];
+    res_avx3 = c_eval[94];
+    res_avx5 = c_eval[95];
+    res_avx7 = c_eval[112];
+    c22_avx = c_eval[115];
+    c23_avx = c_eval[116];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[114], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[117], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[113], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[98], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[97], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final5[0] = res_avx0;
+    result_final5[1] = res_avx1;
+    result_final5[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final5[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final5[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final5[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final5[6] = res_avx6;
+    result_final5[7] = res_avx7;
+    //------------------------AVX interpolation for 6th poly ends--------------
+
+    //---7th poly: partial products 54..62 -> first vectors c_eval[102..110], second vectors c_eval[118..126]---
+    res_avx0 = c_eval[102];
+    res_avx2 = c_eval[103];
+    res_avx4 = c_eval[104];
+    res_avx6 = c_eval[105];
+    c6_avx = c_eval[108];
+    c7_avx = c_eval[109];
+
+    c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[110], c6_avx), c7_avx);
+
+    res_avx1 = c_eval[118];
+    res_avx3 = c_eval[119];
+    res_avx5 = c_eval[120];
+    res_avx7 = c_eval[121];
+    c22_avx = c_eval[124];
+    c23_avx = c_eval[125];
+
+    c21_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[123], res_avx5), res_avx7);
+    c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[126], c22_avx), c23_avx);
+    c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[122], res_avx1), res_avx3);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[107], res_avx4), res_avx6);
+    res_avx5 = _mm256_add_epi16(res_avx5, temp);
+    temp = _mm256_sub_epi16(_mm256_sub_epi16(c_eval[106], res_avx0), res_avx2);
+    res_avx1 = _mm256_add_epi16(res_avx1, temp);
+    c22_avx = _mm256_add_epi16(c22_avx, c8_avx);
+    res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
+    res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
+    c7_avx = _mm256_add_epi16(c7_avx, c24_avx);
+    c6_avx = _mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
+    c22_avx = _mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
+    c7_avx = _mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
+    c23_avx = _mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
+
+    result_final6[0] = res_avx0;
+    result_final6[1] = res_avx1;
+    result_final6[2] = _mm256_add_epi16(res_avx2, c6_avx);
+    result_final6[3] = _mm256_add_epi16(res_avx3, c22_avx);
+    result_final6[4] = _mm256_add_epi16(res_avx4, c7_avx);
+    result_final6[5] = _mm256_add_epi16(res_avx5, c23_avx);
+    result_final6[6] = res_avx6;
+    result_final6[7] = res_avx7;
+    //------------------------AVX interpolation for 7th poly ends--------------
+}
+
+void PQCLEAN_SABER_AVX2_toom4_mul_A_by_B_eval(toom4_points_product *c_eval, const poly *a, const toom4_points *b_eval, int accumulate) {
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i aw_avx[7 * L]; // evaluations of a at the 7 points, L vectors per point
+    __m256i *va = (__m256i *)a->coeffs; // a viewed as four consecutive blocks a0..a3 of L vectors each (only read here)
+    // Evaluates a at 7 points (lane-wise 16-bit arithmetic) and passes the result with the pre-evaluated b_eval to batch_64coefficient_multiplications(), which writes c_eval; accumulate is forwarded unchanged (0 -> schoolbook_avx, otherwise schoolbook_avx_acc in the callee).
+    for (i = 0; i < L; i++) {
+        r0_avx = va[0 * L + i];
+        r1_avx = va[1 * L + i];
+        r2_avx = va[2 * L + i];
+        r3_avx = va[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx);
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx);
+        aw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // a0 + a1 + a2 + a3 (point 1)
+        aw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // a0 - a1 + a2 - a3 (point -1)
+        r4_avx = _mm256_slli_epi16(r0_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // 8*a0 + 2*a2
+        r5_avx = _mm256_slli_epi16(r1_avx, 2);
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // 4*a1 + a3
+        aw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8*a0 + 4*a1 + 2*a2 + a3 (8 times the value at 1/2)
+        aw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8*a0 - 4*a1 + 2*a2 - a3 (8 times the value at -1/2)
+        r4_avx = _mm256_slli_epi16(r3_avx, 3);
+        r6_avx = _mm256_slli_epi16(r2_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        r6_avx = _mm256_slli_epi16(r1_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        aw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // a0 + 2*a1 + 4*a2 + 8*a3 (point 2)
+        aw_avx[6 * L + i] = r0_avx; // a0 (point 0)
+        aw_avx[0 * L + i] = r3_avx; // a3, the leading block (point at infinity)
+    }
+    // aw_avx is laid out as 7 groups of L vectors, the layout batch_64coefficient_multiplications() indexes as a[j * L + k]
+    batch_64coefficient_multiplications(c_eval, aw_avx, b_eval, accumulate);
+}
+
+void PQCLEAN_SABER_AVX2_toom4_eval(toom4_points *b_eval, const poly *b) {
+    size_t i;
+    __m256i bw_avx[7 * L]; // evaluations of b at the 7 points, L vectors per point
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
+    __m256i *vb = (__m256i *)b->coeffs; // b viewed as four consecutive blocks b0..b3 of L vectors each (only read here)
+    __m256i *vb_eval = (__m256i *)b_eval->coeffs; // output buffer, filled by karatsuba_eval()
+    // Evaluates b at 7 points (lane-wise 16-bit arithmetic) into bw_avx, then lets karatsuba_eval() expand and store the result in b_eval.
+    for (i = 0; i < L; i++) {
+        r0_avx = vb[0 * L + i];
+        r1_avx = vb[1 * L + i];
+        r2_avx = vb[2 * L + i];
+        r3_avx = vb[3 * L + i];
+        r4_avx = _mm256_add_epi16(r0_avx, r2_avx);
+        r5_avx = _mm256_add_epi16(r1_avx, r3_avx);
+        bw_avx[2 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // b0 + b1 + b2 + b3 (point 1)
+        bw_avx[3 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // b0 - b1 + b2 - b3 (point -1)
+        r4_avx = _mm256_slli_epi16(r0_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r2_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1); // 8*b0 + 2*b2
+        r5_avx = _mm256_slli_epi16(r1_avx, 2);
+        r5_avx = _mm256_add_epi16(r5_avx, r3_avx); // 4*b1 + b3
+        bw_avx[4 * L + i] = _mm256_add_epi16(r4_avx, r5_avx); // 8*b0 + 4*b1 + 2*b2 + b3 (8 times the value at 1/2)
+        bw_avx[5 * L + i] = _mm256_sub_epi16(r4_avx, r5_avx); // 8*b0 - 4*b1 + 2*b2 - b3 (8 times the value at -1/2)
+        r4_avx = _mm256_slli_epi16(r3_avx, 3);
+        r6_avx = _mm256_slli_epi16(r2_avx, 2);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        r6_avx = _mm256_slli_epi16(r1_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r6_avx);
+        bw_avx[1 * L + i] = _mm256_add_epi16(r4_avx, r0_avx); // b0 + 2*b1 + 4*b2 + 8*b3 (point 2)
+        bw_avx[6 * L + i] = r0_avx; // b0 (point 0)
+        bw_avx[0 * L + i] = r3_avx; // b3, the leading block (point at infinity)
+    }
+    // bw_avx is laid out as 7 groups of L vectors, the layout karatsuba_eval() indexes as b[j * L + k]
+    karatsuba_eval(vb_eval, bw_avx);
+}
+
+
+void PQCLEAN_SABER_AVX2_toom4_interp(poly *res, const toom4_points_product *c_eval) {
+    size_t i;
+    __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
+    __m256i w1_avx[2 * L], w2_avx[2 * L], w3_avx[2 * L], w4_avx[2 * L], w5_avx[2 * L], w6_avx[2 * L], w7_avx[2 * L]; // outputs of karatsuba_interp(), one array per evaluation point
+    __m256i res_full[32]; // unreduced product: 7 blocks of 2*L vectors, block k starting at res_full[k * L]
+    __m256i *vc = (__m256i *)c_eval->coeffs; // NOTE(review): const is cast away and transpose() is called on vc below, so the caller's c_eval is presumably rewritten in place - confirm
+    __m256i *vres = (__m256i *)res->coeffs;
+    // Turns the point-wise products in c_eval into the product polynomial: transpose(), karatsuba_interp(), the 7-point interpolation, overlap-add into res_full, then reduction into res.
+    transpose(vc);
+    transpose(vc + 16);
+    transpose(vc + 32);
+    transpose(vc + 48);
+    transpose(vc + 64);
+    transpose(vc + 80);
+    transpose(vc + 96);
+    transpose(vc + 112);
+    // NOTE(review): karatsuba_interp() writes 8 vectors into each 2*L-sized array, and the literals 32 and 16 are used below, so this code appears to assume L == 4 - confirm
+    karatsuba_interp(w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx, vc);
+    // Interpolation, one vector position at a time; every step is lane-wise arithmetic on 16-bit words
+    for (i = 0; i < 2 * L; i++) {
+        r0_avx = w1_avx[i];
+        r1_avx = w2_avx[i];
+        r2_avx = w3_avx[i];
+        r3_avx = w4_avx[i];
+        r4_avx = w5_avx[i];
+        r5_avx = w6_avx[i];
+        r6_avx = w7_avx[i];
+        // presumably r0..r6 are the products at the points (infinity, 2, 1, -1, 1/2, -1/2, 0), matching the aw_avx/bw_avx group order of the evaluation routines
+        r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
+        r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
+        r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
+        r3_avx = _mm256_srli_epi16(r3_avx, 1);
+        r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
+        temp_avx = _mm256_slli_epi16(r6_avx, 6);
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+        r4_avx = _mm256_slli_epi16(r4_avx, 1);
+        r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
+        r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
+        temp_avx = _mm256_slli_epi16(r2_avx, 6);
+
+        r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
+        r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
+        r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
+        r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
+        temp_avx = _mm256_mullo_epi16(r2_avx, _mm256_set1_epi16(45));
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+        temp_avx = _mm256_slli_epi16(r2_avx, 3);
+
+        r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
+        r4_avx = _mm256_mullo_epi16(r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
+        r4_avx = _mm256_srli_epi16(r4_avx, 3);
+        r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
+        temp_avx = _mm256_slli_epi16(r3_avx, 4);
+
+        r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
+        r1_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
+        r1_avx = _mm256_srli_epi16(r1_avx, 1);
+        r3_avx = _mm256_add_epi16(r1_avx, r3_avx);
+        r3_avx = _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
+        temp_avx = _mm256_mullo_epi16(r1_avx, _mm256_set1_epi16(30));
+        temp_avx = _mm256_sub_epi16(temp_avx, r5_avx);
+        temp_avx = _mm256_mullo_epi16(temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
+
+        r5_avx = _mm256_srli_epi16(temp_avx, 2);
+        r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
+        r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
+        // r6..r0 are stored as blocks 0..6; for i >= L a block's upper half lands on slots already holding the next block's lower half, so it is added (block 6's upper half is a plain store)
+        if (i < L) {
+            res_full[0 * L + i] = r6_avx;
+            res_full[1 * L + i] = r5_avx;
+            res_full[2 * L + i] = r4_avx;
+            res_full[3 * L + i] = r3_avx;
+            res_full[4 * L + i] = r2_avx;
+            res_full[5 * L + i] = r1_avx;
+            res_full[6 * L + i] = r0_avx;
+        } else {
+            res_full[0 * L + i] = _mm256_add_epi16(res_full[0 * L + i], r6_avx);
+            res_full[1 * L + i] = _mm256_add_epi16(res_full[1 * L + i], r5_avx);
+            res_full[2 * L + i] = _mm256_add_epi16(res_full[2 * L + i], r4_avx);
+            res_full[3 * L + i] = _mm256_add_epi16(res_full[3 * L + i], r3_avx);
+            res_full[4 * L + i] = _mm256_add_epi16(res_full[4 * L + i], r2_avx);
+            res_full[5 * L + i] = _mm256_add_epi16(res_full[5 * L + i], r1_avx);
+            res_full[6 * L + i] = r0_avx;
+        }
+    }
+
+    // Reduction by X^256 + 1: the upper 16 vectors (256 coefficients) are subtracted from the lower 16
+    for (i = 0; i < 16; i++) {
+        vres[i] = _mm256_sub_epi16(res_full[i], res_full[i + 16]);
+    }
+}
diff --git a/crypto_kem/saber/avx2/polymul/consts.h b/crypto_kem/saber/avx2/polymul/consts.h
deleted file mode 100644
index 40826398..00000000
--- a/crypto_kem/saber/avx2/polymul/consts.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "../SABER_params.h"
-
-#define AVX_N (SABER_N >> 4)
-#define small_len_avx (AVX_N >> 2)
-
-#define SCHB_N 16
-
-#define N_SB (SABER_N >> 2)
-#define N_SB_RES (2*N_SB-1)
-
-#define N_SB_16 (N_SB >> 2)
-#define N_SB_16_RES (2*N_SB_16-1)
-
-#define AVX_N1 16 /*N/16*/ 
-
-#define SCM_SIZE 16
-
-// The dimension of a vector. i.e vector has NUM_POLY elements and Matrix has NUM_POLY X NUM_POLY elements
-#define NUM_POLY SABER_K
-//int NUM_POLY=2; 
diff --git a/crypto_kem/saber/avx2/polymul/matrix.c b/crypto_kem/saber/avx2/polymul/matrix.c
deleted file mode 100644
index 5fa35783..00000000
--- a/crypto_kem/saber/avx2/polymul/matrix.c
+++ /dev/null
@@ -1,303 +0,0 @@
-#include <immintrin.h>
-
-static void transpose_n1(__m256i *M)
-{
-	//int i;
-	register __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
-	register __m256i temp, temp0, temp1, temp2;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi16(M[0], M[1]); 
-		r1 = _mm256_unpacklo_epi16(M[2], M[3]); 
-		r2 = _mm256_unpacklo_epi16(M[4], M[5]); 
-		r3 = _mm256_unpacklo_epi16(M[6], M[7]);
-		r4 = _mm256_unpacklo_epi16(M[8], M[9]); 
-		r5 = _mm256_unpacklo_epi16(M[10], M[11]);
-		r6 = _mm256_unpacklo_epi16(M[12], M[13]); 
-		r7 = _mm256_unpacklo_epi16(M[14], M[15]); 
-
-
-		temp = _mm256_unpacklo_epi32(r0, r1); 
-		temp0 = _mm256_unpacklo_epi32(r2, r3); 
-		temp1 = _mm256_unpacklo_epi32(r4, r5); 
-		temp2 = _mm256_unpacklo_epi32(r6, r7); 
-
-		r8 = _mm256_unpackhi_epi32(r0, r1); 
-		r9 = _mm256_unpackhi_epi32(r2, r3); 
-		r10 = _mm256_unpackhi_epi32(r4, r5); 
-		r11 = _mm256_unpackhi_epi32(r6, r7);
-
-		r0 = _mm256_unpacklo_epi64(temp, temp0); 
-		r2 = _mm256_unpackhi_epi64(temp, temp0); 
-
-		r1 = _mm256_unpacklo_epi64(temp1, temp2); 
-		r3 = _mm256_unpackhi_epi64(temp1, temp2);
-
-		temp = _mm256_unpackhi_epi16(M[0], M[1]); 
-		temp0 = _mm256_unpackhi_epi16(M[2], M[3]); 
-		temp1 = _mm256_unpackhi_epi16(M[4], M[5]); 
-		temp2 = _mm256_unpackhi_epi16(M[6], M[7]); 
-		r4 = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		M[0] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[8] = _mm256_permute2f128_si256(r0, r1, 0x31);
-		M[1] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[9] = _mm256_permute2f128_si256(r2, r3, 0x31);
-
-
-		r5 = _mm256_unpackhi_epi16(M[10], M[11]); 
-		r6 = _mm256_unpackhi_epi16(M[12], M[13]); 
-		r7 = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-
-
-		r0 = _mm256_unpacklo_epi64(r8, r9); 
-		r1 = _mm256_unpacklo_epi64(r10, r11); 
-
-		r2 = _mm256_unpackhi_epi64(r8, r9); 
-		r3 = _mm256_unpackhi_epi64(r10, r11); 
-
-
-
-		M[3] = _mm256_permute2f128_si256(r2, r3, 0x20);
-		M[11] = _mm256_permute2f128_si256(r2, r3, 0x31);
-		M[2] = _mm256_permute2f128_si256(r0, r1, 0x20);
-		M[10] = _mm256_permute2f128_si256(r0, r1, 0x31);
-
-
-	//for(i=0; i<4; i=i+1)
-	//{
-		r0 = _mm256_unpacklo_epi32(temp, temp0); 
-		r1 = _mm256_unpacklo_epi32(temp1, temp2);
-		r2 = _mm256_unpacklo_epi32(r4, r5); 
-		r3 = _mm256_unpacklo_epi32(r6, r7); 
-
-	//}
-
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		r8 = _mm256_unpacklo_epi64(r0, r1); 
-		r10 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r9 = _mm256_unpacklo_epi64(r2, r3); 
-		r11 = _mm256_unpackhi_epi64(r2, r3); 
-
-		M[4] = _mm256_permute2f128_si256(r8, r9, 0x20);
-		M[12] = _mm256_permute2f128_si256(r8, r9, 0x31);
-		M[5] = _mm256_permute2f128_si256(r10, r11, 0x20);
-		M[13] = _mm256_permute2f128_si256(r10, r11, 0x31);
-
-		r0 = _mm256_unpackhi_epi32(temp, temp0); 
-		r1 = _mm256_unpackhi_epi32(temp1, temp2); 
-		r2 = _mm256_unpackhi_epi32(r4, r5); 
-		r3 = _mm256_unpackhi_epi32(r6, r7); 
-
-	//}
-//	for(i=0; i<2; i=i+1)
-//	{
-		r4 = _mm256_unpacklo_epi64(r0, r1); 
-		r6 = _mm256_unpackhi_epi64(r0, r1); 
-
-		r5 = _mm256_unpacklo_epi64(r2, r3); 
-		r7 = _mm256_unpackhi_epi64(r2, r3); 
-
-//	}
-
-	//-------------------------------------------------------
-
-	M[6] = _mm256_permute2f128_si256(r4, r5, 0x20);
-	M[14] = _mm256_permute2f128_si256(r4, r5, 0x31);
-	M[7] = _mm256_permute2f128_si256(r6, r7, 0x20);
-	M[15] = _mm256_permute2f128_si256(r6, r7, 0x31);
-}
-
-/*
-void transpose_unrolled(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	__m256i r0, r1, r2, r3, r4, r5, r6, r7;
-
-	//for(i=0; i<8; i=i+1)
-	//{
-		tL[0] = _mm256_unpacklo_epi16(M[0], M[1]); 
-		tH[0] = _mm256_unpackhi_epi16(M[0], M[1]); 
-
-		tL[1] = _mm256_unpacklo_epi16(M[2], M[3]); 
-		tH[1] = _mm256_unpackhi_epi16(M[2], M[3]); 
-
-		tL[2] = _mm256_unpacklo_epi16(M[4], M[5]); 
-		tH[2] = _mm256_unpackhi_epi16(M[4], M[5]); 
-
-		tL[3] = _mm256_unpacklo_epi16(M[6], M[7]); 
-		tH[3] = _mm256_unpackhi_epi16(M[6], M[7]); 
-
-		tL[4] = _mm256_unpacklo_epi16(M[8], M[9]); 
-		tH[4] = _mm256_unpackhi_epi16(M[8], M[9]); 
-
-		tL[5] = _mm256_unpacklo_epi16(M[10], M[11]); 
-		tH[5] = _mm256_unpackhi_epi16(M[10], M[11]); 
-
-		tL[6] = _mm256_unpacklo_epi16(M[12], M[13]); 
-		tH[6] = _mm256_unpackhi_epi16(M[12], M[13]); 
-
-		tL[7] = _mm256_unpacklo_epi16(M[14], M[15]); 
-		tH[7] = _mm256_unpackhi_epi16(M[14], M[15]); 
-
-	//}
-
-	//-------------------------------------------------------
-	//for(i=0; i<4; i=i+1)
-	//{
-		bL[0] = _mm256_unpacklo_epi32(tL[0], tL[1]); 
-		bH[0] = _mm256_unpackhi_epi32(tL[0], tL[1]); 
-
-		bL[1] = _mm256_unpacklo_epi32(tL[2], tL[3]); 
-		bH[1] = _mm256_unpackhi_epi32(tL[2], tL[3]); 
-
-		bL[2] = _mm256_unpacklo_epi32(tL[4], tL[5]); 
-		bH[2] = _mm256_unpackhi_epi32(tL[4], tL[5]); 
-
-		bL[3] = _mm256_unpacklo_epi32(tL[6], tL[7]); 
-		bH[3] = _mm256_unpackhi_epi32(tL[6], tL[7]); 
-
-	//}
-
-	//for(i=0; i<2; i=i+1)
-	//{
-		dL[0] = _mm256_unpacklo_epi64(bL[0], bL[1]); 
-		dH[0] = _mm256_unpackhi_epi64(bL[0], bL[1]); 
-
-		dL[1] = _mm256_unpacklo_epi64(bL[2], bL[3]); 
-		dH[1] = _mm256_unpackhi_epi64(bL[2], bL[3]);
-
-		M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-		M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-		M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-		M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	//}
-	//for(i=0; i<2; i=i+1)
-	//{
-		eL[0] = _mm256_unpacklo_epi64(bH[0], bH[1]); 
-		eH[0] = _mm256_unpackhi_epi64(bH[0], bH[1]); 
-
-		eL[1] = _mm256_unpacklo_epi64(bH[2], bH[3]); 
-		eH[1] = _mm256_unpackhi_epi64(bH[2], bH[3]); 
-
-	//}
-
-	//-------------------------------------------------------
-
-	//-------------------------------------------------------
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	//-------------------------------------------------------
-
-
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-
-
-void transpose1(__m256i *M)
-{
-	int i;
-	__m256i tL[8], tH[8];
-	__m256i bL[4], bH[4], cL[4], cH[4];
-	__m256i dL[2], dH[2], eL[2], eH[2], fL[2], fH[2], gL[2], gH[2];
-
-	for(i=0; i<8; i=i+1)
-	{
-		tL[i] = _mm256_unpacklo_epi16(M[2*i], M[2*i+1]); 
-		tH[i] = _mm256_unpackhi_epi16(M[2*i], M[2*i+1]); 
-	}
-
-	for(i=0; i<4; i=i+1)
-	{
-		bL[i] = _mm256_unpacklo_epi32(tL[2*i], tL[2*i+1]); 
-		bH[i] = _mm256_unpackhi_epi32(tL[2*i], tL[2*i+1]); 
-	}
-	for(i=0; i<4; i=i+1)
-	{
-		cL[i] = _mm256_unpacklo_epi32(tH[2*i], tH[2*i+1]); 
-		cH[i] = _mm256_unpackhi_epi32(tH[2*i], tH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		dL[i] = _mm256_unpacklo_epi64(bL[2*i], bL[2*i+1]); 
-		dH[i] = _mm256_unpackhi_epi64(bL[2*i], bL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		eL[i] = _mm256_unpacklo_epi64(bH[2*i], bH[2*i+1]); 
-		eH[i] = _mm256_unpackhi_epi64(bH[2*i], bH[2*i+1]); 
-	}
-
-	for(i=0; i<2; i=i+1)
-	{
-		fL[i] = _mm256_unpacklo_epi64(cL[2*i], cL[2*i+1]); 
-		fH[i] = _mm256_unpackhi_epi64(cL[2*i], cL[2*i+1]); 
-	}
-	for(i=0; i<2; i=i+1)
-	{
-		gL[i] = _mm256_unpacklo_epi64(cH[2*i], cH[2*i+1]); 
-		gH[i] = _mm256_unpackhi_epi64(cH[2*i], cH[2*i+1]); 
-	}
-
-	M[0] = _mm256_permute2f128_si256(dL[0], dL[1], 0x20);
-	M[8] = _mm256_permute2f128_si256(dL[0], dL[1], 0x31);
-	M[1] = _mm256_permute2f128_si256(dH[0], dH[1], 0x20);
-	M[9] = _mm256_permute2f128_si256(dH[0], dH[1], 0x31);
-
-	M[2] = _mm256_permute2f128_si256(eL[0], eL[1], 0x20);
-	M[10] = _mm256_permute2f128_si256(eL[0], eL[1], 0x31);
-	M[3] = _mm256_permute2f128_si256(eH[0], eH[1], 0x20);
-	M[11] = _mm256_permute2f128_si256(eH[0], eH[1], 0x31);
-
-	M[4] = _mm256_permute2f128_si256(fL[0], fL[1], 0x20);
-	M[12] = _mm256_permute2f128_si256(fL[0], fL[1], 0x31);
-	M[5] = _mm256_permute2f128_si256(fH[0], fH[1], 0x20);
-	M[13] = _mm256_permute2f128_si256(fH[0], fH[1], 0x31);
-
-	M[6] = _mm256_permute2f128_si256(gL[0], gL[1], 0x20);
-	M[14] = _mm256_permute2f128_si256(gL[0], gL[1], 0x31);
-	M[7] = _mm256_permute2f128_si256(gH[0], gH[1], 0x20);
-	M[15] = _mm256_permute2f128_si256(gH[0], gH[1], 0x31);
-}
-*/
diff --git a/crypto_kem/saber/avx2/polymul/scm_avx.c b/crypto_kem/saber/avx2/polymul/scm_avx.c
deleted file mode 100644
index 48870f51..00000000
--- a/crypto_kem/saber/avx2/polymul/scm_avx.c
+++ /dev/null
@@ -1,753 +0,0 @@
-//#define SCM_SIZE 16
-
-//#pragma STDC FP_CONTRACT ON
-
-#include <immintrin.h>
-
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) { 
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-
-static void schoolbook_avx_new3_acc(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are added cummulatively
-{
-
-	register __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	register __m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-
-	//otherwise accumulate
-	c_avx[0] = mul_add(a0, b0, c_avx[0]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	temp=mul_add(a1, b0, temp);
-	c_avx[1] = _mm256_add_epi16(temp, c_avx[1]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-	temp = mul_add(a1, b1, temp);
-	temp=mul_add(a2, b0, temp);
-	c_avx[2] = _mm256_add_epi16(temp, c_avx[2]);
-	
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	temp=mul_add(a3, b0, temp);
-	c_avx[3] = _mm256_add_epi16(temp, c_avx[3]);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	temp=mul_add(a2, b2, temp);
-	c_avx[4] = _mm256_add_epi16(temp, c_avx[4]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	temp=mul_add(a5, b0, temp);
-	c_avx[5] = _mm256_add_epi16(temp, c_avx[5]);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	temp=mul_add(a4, b2, temp);
-	c_avx[6] = _mm256_add_epi16(temp, c_avx[6]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	temp=mul_add(a5, b2, temp);
-	c_avx[7] = _mm256_add_epi16(temp, c_avx[7]);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	
-		temp=mul_add(a6, b2, temp);
-		c_avx[8] = _mm256_add_epi16(temp, c_avx[8]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-		temp=mul_add(a7, b2, temp);
-		c_avx[9] = _mm256_add_epi16(temp, c_avx[9]);
-
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-		temp=mul_add(a[8], b2, temp);
-		c_avx[10] = _mm256_add_epi16(temp, c_avx[10]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-		temp=mul_add(a[9], b2, temp);
-		c_avx[11] = _mm256_add_epi16(temp, c_avx[11]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-		temp=mul_add(a[10], b2, temp);
-		c_avx[12] = _mm256_add_epi16(temp, c_avx[12]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-		temp=mul_add(a[11], b2, temp);
-		c_avx[13] = _mm256_add_epi16(temp, c_avx[13]);
-
-
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-		temp=mul_add(a[12], b2, temp);
-		c_avx[14] = _mm256_add_epi16(temp, c_avx[14]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-		temp=mul_add(a[13], b2, temp);
-		c_avx[15] = _mm256_add_epi16(temp, c_avx[15]);
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-		temp=mul_add(a1, b[1], temp);
-		c_avx[16] = _mm256_add_epi16(temp, c_avx[16]);
-
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-		temp=mul_add(a1, b[2], temp);
-		c_avx[17] = _mm256_add_epi16(temp, c_avx[17]);
-
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-		temp=mul_add(a1, b[3], temp);
-		c_avx[18] = _mm256_add_epi16(temp, c_avx[18]);
-
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-		temp=mul_add(a1, b[4], temp);
-		c_avx[19] = _mm256_add_epi16(temp, c_avx[19]);
-
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-		temp=mul_add(a1, b[5], temp);
-		c_avx[20] = _mm256_add_epi16(temp, c_avx[20]);
-
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-		temp=mul_add(a1, b[6], temp);
-		c_avx[21] = _mm256_add_epi16(temp, c_avx[21]);
-
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-		temp=mul_add(a1, b[7], temp);
-		c_avx[22] = _mm256_add_epi16(temp, c_avx[22]);
-
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-		temp=mul_add(a1, b7, temp);
-		c_avx[23] = _mm256_add_epi16(temp, c_avx[23]);
-
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-		temp=mul_add(a1, b6, temp);
-		c_avx[24] = _mm256_add_epi16(temp, c_avx[24]);
-
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-		temp=mul_add(a1, b5, temp);
-		c_avx[25] = _mm256_add_epi16(temp, c_avx[25]);
-
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-		temp=mul_add(a1, b4, temp);
-		c_avx[26] = _mm256_add_epi16(temp, c_avx[26]);
-
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-		temp=mul_add(a1, b3, temp);
-		c_avx[27] = _mm256_add_epi16(temp, c_avx[27]);
-
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-		temp=mul_add(a1, b2, temp);
-		c_avx[28] = _mm256_add_epi16(temp, c_avx[28]);
-
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-		temp=mul_add(a1, b0, temp);
-		c_avx[29] = _mm256_add_epi16(temp, c_avx[29]);
-
-
-		c_avx[30] = mul_add(a1, b1, c_avx[30]);
-
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-
-}
-
-
-
-static void schoolbook_avx_new2(__m256i* a, __m256i* b, __m256i* c_avx) ////8 coefficients of a and b has been prefetched
-									      //the c_avx are not added cummulatively
-{
-
-	__m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-	__m256i temp;
-
-
-	a0=a[0];
-	a1=a[1];
-	a2=a[2];
-	a3=a[3];
-	a4=a[4];
-	a5=a[5];
-	a6=a[6];
-	a7=a[7];
-
-	b0=b[0];
-	b1=b[1];
-	b2=b[2];
-	b3=b[3];
-	b4=b[4];
-	b5=b[5];
-	b6=b[6];
-	b7=b[7];
-
-	// New Unrolled first triangle
-	c_avx[0] = _mm256_mullo_epi16 (a0, b0);
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[1]=mul_add(a1, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b2);
-
-	temp = mul_add(a1, b1, temp);
-	c_avx[2]= mul_add(a2, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b3);
-	temp = mul_add(a1, b2, temp);
-	temp = mul_add(a2, b1, temp);
-	c_avx[3]= mul_add(a3, b0, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b4);
-	temp = mul_add(a1, b3, temp);
-	temp = mul_add(a3, b1, temp);
-	temp = mul_add(a4, b0, temp);
-	c_avx[4]= mul_add(a2, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b5);
-	temp = mul_add(a1, b4 , temp);
-	temp = mul_add(a2, b3, temp);
-	temp = mul_add(a3, b2, temp);
-	temp = mul_add( a4, b1, temp);
-	c_avx[5] = mul_add(a5, b0, temp);
-	
-	temp = _mm256_mullo_epi16 (a0, b6);
-	temp = mul_add(a1, b5, temp);
-	temp = mul_add(a5, b1, temp);
-	temp = mul_add(a6, b0, temp);
-	temp = mul_add(a2, b4, temp);
-	temp = mul_add(a3, b3, temp);
-	c_avx[6] = mul_add(a4, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b7);
-	temp = mul_add(a1, b6, temp);
-	temp = mul_add (a6, b1, temp);
-	temp = mul_add (a7, b0, temp);
-	temp = mul_add(a2, b5, temp);
-	temp = mul_add (a3, b4, temp);
-	temp = mul_add (a4, b3, temp);
-	c_avx[7] = mul_add (a5, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[8]);
-	temp = mul_add (a1, b7, temp);
-	temp = mul_add (a7, b1, temp);
-	temp = mul_add (a[8], b0, temp);
-	temp = mul_add (a2, b6,temp);
-	temp = mul_add(a3, b5, temp);
-	temp = mul_add (a4, b4,temp);
-	temp = mul_add (a5, b3, temp);
-	c_avx[8] = mul_add (a6, b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[9]);
-	temp = mul_add (a1, b[8], temp);
-	temp = mul_add (a[8], b1, temp);
-	temp = mul_add (a[9], b0, temp);
-	temp = mul_add (a2, b7, temp);
-	temp = mul_add (a3, b6, temp);
-	temp = mul_add (a4, b5, temp);
-	temp = mul_add (a5, b4, temp);
-	temp = mul_add (a6, b3, temp);
-	c_avx[9] = mul_add (a7, b2, temp);
-
-	temp= _mm256_mullo_epi16 (a0, b[10]);
-	temp = mul_add (a1, b[9], temp);
-	temp = mul_add (a[9], b1, temp);
-	temp = mul_add (a[10], b0, temp);
-	temp = mul_add (a2, b[8], temp);
-	temp = mul_add (a3, b7, temp);
-	temp = mul_add (a4, b6, temp);
-	temp = mul_add (a5, b5, temp);
-	temp = mul_add (a6, b4, temp);
-	temp = mul_add (a7, b3, temp);
-	c_avx[10] = mul_add (a[8], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[11]);
-	temp = mul_add (a1, b[10], temp );
-	temp = mul_add (a[10], b1, temp );
-	temp = mul_add (a[11], b0, temp );
-	temp = mul_add (a2, b[9], temp );
-	temp = mul_add (a3, b[8], temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a[8], b3, temp );
-	c_avx[11] = mul_add (a[9], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[12]);
-	temp = mul_add (a1, b[11], temp);
-	temp = mul_add (a[11], b1, temp);
-	temp = mul_add (a[12], b0, temp);
-	temp = mul_add (a2, b[10], temp);
-	temp = mul_add (a3, b[9], temp);
-	temp = mul_add (a4, b[8], temp);
-	temp = mul_add (a5, b7, temp);
-	temp = mul_add (a6, b6, temp);
-	temp = mul_add (a7, b5, temp);
-	temp = mul_add (a[8], b4, temp);
-	temp = mul_add (a[9], b3, temp);
-	c_avx[12] = mul_add (a[10], b2, temp);
-
-	temp = _mm256_mullo_epi16 (a0, b[13]);
-	temp = mul_add (a1, b[12], temp );
-	temp = mul_add (a[12], b1, temp );
-	temp = mul_add (a[13], b0, temp );
-	temp = mul_add (a2, b[11], temp );
-	temp = mul_add (a3, b[10], temp );
-	temp = mul_add (a4, b[9], temp );
-	temp = mul_add (a5, b[8], temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a[8], b5, temp );
-	temp = mul_add (a[9], b4, temp );
-	temp = mul_add (a[10], b3, temp );
-	c_avx[13] = mul_add (a[11], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[14]);
-	temp = mul_add (a1, b[13], temp );
-	temp = mul_add (a[13], b1, temp );
-	temp = mul_add (a[14], b0, temp );
-	temp = mul_add (a2, b[12], temp );
-	temp = mul_add (a3, b[11], temp );
-	temp = mul_add (a4, b[10], temp );
-	temp = mul_add (a5, b[9], temp );
-	temp = mul_add (a6, b[8], temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a[8], b6, temp );
-	temp = mul_add (a[9], b5, temp );
-	temp = mul_add (a[10], b4, temp );
-	temp = mul_add (a[11], b3, temp );
-	c_avx[14] = mul_add (a[12], b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b[15]);
-	temp = mul_add (a1, b[14], temp );
-	temp = mul_add (a[14], b1, temp );
-	temp = mul_add (a[15], b0, temp );
-	temp = mul_add (a2, b[13], temp );
-	temp = mul_add (a3, b[12], temp );
-	temp = mul_add (a4, b[11], temp );
-	temp = mul_add (a5, b[10], temp );
-	temp = mul_add (a6, b[9], temp );
-	temp = mul_add (a7, b[8], temp );
-	temp = mul_add (a[8], b7, temp );
-	temp = mul_add (a[9], b6, temp );
-	temp = mul_add (a[10], b5, temp );
-	temp = mul_add (a[11], b4, temp );
-	temp = mul_add (a[12], b3, temp );
-	c_avx[15] = mul_add (a[13], b2, temp );
-
-
-	// unrolled second triangle
-	a0=a[14];
-	a1=a[15];
-	a2=a[13];
-	a3=a[12];
-	a4=a[11];
-	a5=a[10];
-	a6=a[9];
-	a7=a[8];
-
-	b0=b[14];
-	b1=b[15];
-	b2=b[13];
-	b3=b[12];
-	b4=b[11];
-	b5=b[10];
-	b6=b[9];
-	b7=b[8];
-	
-
-	temp = _mm256_mullo_epi16 (a[1], b1);
-	temp = mul_add (a[2], b0, temp );
-	temp = mul_add (a[3], b2, temp );
-	temp = mul_add (a[4], b3, temp );
-	temp = mul_add (a[5], b4, temp );
-	temp = mul_add (a[6], b5, temp );
-	temp = mul_add (a[7], b6, temp );
-	temp = mul_add (a7, b7, temp );
-	temp = mul_add (a6, b[7], temp );
-	temp = mul_add (a5, b[6], temp );
-	temp = mul_add (a4, b[5], temp );
-	temp = mul_add (a3, b[4], temp );
-	temp = mul_add (a2, b[3], temp );
-	temp = mul_add (a0, b[2], temp );
-	c_avx[16] = mul_add (a1, b[1], temp );
-
-	temp = _mm256_mullo_epi16 (a[2], b1);
-	temp = mul_add (a[3], b0, temp );
-	temp = mul_add (a[4], b2, temp );
-	temp = mul_add (a[5], b3, temp );
-	temp = mul_add (a[6], b4, temp );
-	temp = mul_add (a[7], b5, temp );
-	temp = mul_add (a7, b6, temp );
-	temp = mul_add (a6, b7, temp );
-	temp = mul_add (a5, b[7], temp );
-	temp = mul_add (a4, b[6], temp );
-	temp = mul_add (a3, b[5], temp );
-	temp = mul_add (a2, b[4], temp );
-	temp = mul_add (a0, b[3], temp );
-	c_avx[17] = mul_add (a1, b[2], temp );
-
-	temp = _mm256_mullo_epi16 (a[3], b1);
-	temp = mul_add (a[4], b0, temp );
-	temp = mul_add (a[5], b2, temp );
-	temp = mul_add (a[6], b3, temp );
-	temp = mul_add (a[7], b4, temp );
-	temp = mul_add (a7, b5, temp );
-	temp = mul_add (a6, b6, temp );
-	temp = mul_add (a5, b7, temp );
-	temp = mul_add (a4, b[7], temp );
-	temp = mul_add (a3, b[6], temp );
-	temp = mul_add (a2, b[5], temp );
-	temp = mul_add (a0, b[4], temp );
-	c_avx[18] = mul_add (a1, b[3], temp );
-
-	temp = _mm256_mullo_epi16 (a[4], b1);
-	temp = mul_add (a[5], b0, temp );
-	temp = mul_add (a[6], b2, temp );
-	temp = mul_add (a[7], b3, temp );
-	temp = mul_add (a7, b4, temp );
-	temp = mul_add (a6, b5, temp );
-	temp = mul_add (a5, b6, temp );
-	temp = mul_add (a4, b7, temp );
-	temp = mul_add (a3, b[7], temp );
-	temp = mul_add (a2, b[6], temp );
-	temp = mul_add (a0, b[5], temp );
-	c_avx[19] = mul_add (a1, b[4], temp );
-
-	temp = _mm256_mullo_epi16 (a[5], b1);
-	temp = mul_add (a[6], b0, temp );
-	temp = mul_add (a[7], b2, temp );
-	temp = mul_add (a7, b3, temp );
-	temp = mul_add (a6, b4, temp );
-	temp = mul_add (a5, b5, temp );
-	temp = mul_add (a4, b6, temp );
-	temp = mul_add (a3, b7, temp );
-	temp = mul_add (a2, b[7], temp );
-	temp = mul_add (a0, b[6], temp );
-	c_avx[20] = mul_add (a1, b[5], temp );
-
-	temp = _mm256_mullo_epi16 (a[6], b1);
-	temp = mul_add (a[7], b0, temp );
-	temp = mul_add (a7, b2, temp );
-	temp = mul_add (a6, b3, temp );
-	temp = mul_add (a5, b4, temp );
-	temp = mul_add (a4, b5, temp );
-	temp = mul_add (a3, b6, temp );
-	temp = mul_add (a2, b7, temp );
-	temp = mul_add (a0, b[7], temp );
-	c_avx[21] = mul_add (a1, b[6], temp );
-
-	temp = _mm256_mullo_epi16 (a[7], b1);
-	temp = mul_add (a7, b0, temp );
-	temp = mul_add (a6, b2, temp );
-	temp = mul_add (a5, b3, temp );
-	temp = mul_add (a4, b4, temp );
-	temp = mul_add (a3, b5, temp );
-	temp = mul_add (a2, b6, temp );
-	temp = mul_add (a0, b7, temp );
-	c_avx[22] = mul_add (a1, b[7], temp );
-
-	temp = _mm256_mullo_epi16 (a7, b1);
-	temp = mul_add (a6, b0, temp );
-	temp = mul_add (a5, b2, temp );
-	temp = mul_add (a4, b3, temp );
-	temp = mul_add (a3, b4, temp );
-	temp = mul_add (a2, b5, temp );
-	temp = mul_add (a0, b6, temp );
-	c_avx[23] = mul_add (a1, b7, temp );
-
-	temp = _mm256_mullo_epi16 (a6, b1);
-	temp = mul_add (a5, b0, temp );
-	temp = mul_add (a4, b2, temp );
-	temp = mul_add (a3, b3, temp );
-	temp = mul_add (a2, b4, temp );
-	temp = mul_add (a0, b5, temp );
-	c_avx[24] = mul_add (a1, b6, temp );
-
-	temp = _mm256_mullo_epi16 (a5, b1);
-	temp = mul_add (a4, b0, temp );
-	temp = mul_add (a3, b2, temp );
-	temp = mul_add (a2, b3, temp );
-	temp = mul_add (a0, b4, temp );
-	c_avx[25] = mul_add (a1, b5, temp );
-
-	temp = _mm256_mullo_epi16 (a4, b1);
-	temp = mul_add (a3, b0, temp );
-	temp = mul_add (a2, b2, temp );
-	temp = mul_add (a0, b3, temp );
-	c_avx[26] = mul_add (a1, b4, temp );
-
-	temp = _mm256_mullo_epi16 (a3, b1);
-	temp = mul_add (a2, b0, temp );
-	temp = mul_add (a0, b2, temp );
-	c_avx[27] = mul_add (a1, b3, temp );
-
-	temp = _mm256_mullo_epi16 (a2, b1);
-	temp = mul_add (a0, b0, temp );
-	c_avx[28] = mul_add (a1, b2, temp );
-
-	temp = _mm256_mullo_epi16 (a0, b1);
-	c_avx[29] = mul_add (a1, b0, temp);
-
-	c_avx[30] = _mm256_mullo_epi16 (a1, b1);
-
-
-	c_avx[2*SCM_SIZE-1] = _mm256_set_epi64x(0, 0, 0, 0);
-
-}
diff --git a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c b/crypto_kem/saber/avx2/polymul/toom-cook_4way.c
deleted file mode 100644
index 78fb86c2..00000000
--- a/crypto_kem/saber/avx2/polymul/toom-cook_4way.c
+++ /dev/null
@@ -1,1010 +0,0 @@
-/*
-Cleaned version for step by step approach look into the _debug file
-*/
-//#include "timing.c"
-#include "consts.h"
-#include "matrix.c"
-#include "scm_avx.c"
-
-static void batch_64coefficient_multiplications_new(__m256i* a, __m256i* b_bucket, __m256i* c_bucket, int f)//all 7 Karatsuba evaluation and interpolation are done in AVX.
-{
-	__m256i a_bucket[SCM_SIZE*4]; //SCM_SIZE = 16; Holds evaluation (a & b) for 7 Karatsuba at a time
-
-	//uint16_t i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-
-		//CLOCK1=cpucycles();
-		
-		//------------------AVX evaluation for 1st poly-----------------------
-
-                    r0_avx=a[0];
-                    r1_avx=a[1];
-                    r2_avx=a[2];
-                    r3_avx=a[3];
-		    a_bucket[0]=r0_avx;
-		    a_bucket[1]=r1_avx;
-		    a_bucket[2]=r2_avx;
-		    a_bucket[3]=r3_avx;
-		    a_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8]= _mm256_add_epi16(a_bucket[6],a_bucket[7]);
-
-
-		//------------------AVX evaluation for 1st poly ends------------------
-
-
-		//------------------AVX evaluation for 2nd poly-----------------------
-                    r0_avx=a[small_len_avx];
-                    r1_avx=a[small_len_avx+1];
-                    r2_avx=a[small_len_avx+2];
-                    r3_avx=a[small_len_avx+3];
-		    a_bucket[0+9]=r0_avx;
-		    a_bucket[1+9]=r1_avx;
-		    a_bucket[2+9]=r2_avx;
-		    a_bucket[3+9]=r3_avx;
-		    a_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+9]= _mm256_add_epi16(a_bucket[6+9],a_bucket[7+9]);
-
-	
-		//------------------AVX evaluation for 2nd poly ends------------------
-
-
-		//------------------AVX evaluation for 3rd poly-----------------------
-                    r0_avx=a[2*small_len_avx];
-                    r1_avx=a[2*small_len_avx+1];
-                    r2_avx=a[2*small_len_avx+2];
-                    r3_avx=a[2*small_len_avx+3];
-		    a_bucket[0+18]=r0_avx;
-		    a_bucket[1+18]=r1_avx;
-		    a_bucket[2+18]=r2_avx;
-		    a_bucket[3+18]=r3_avx;
-		    a_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+18]= _mm256_add_epi16(a_bucket[6+18],a_bucket[7+18]);
-		
-		//------------------AVX evaluation for 3rd poly ends------------------
-
-
-		//------------------AVX evaluation for 4th poly-----------------------
-
-                    r0_avx=a[3*small_len_avx];
-                    r1_avx=a[3*small_len_avx+1];
-                    r2_avx=a[3*small_len_avx+2];
-                    r3_avx=a[3*small_len_avx+3];
-		    a_bucket[0+27]=r0_avx;
-		    a_bucket[1+27]=r1_avx;
-		    a_bucket[2+27]=r2_avx;
-		    a_bucket[3+27]=r3_avx;
-		    a_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+27]= _mm256_add_epi16(a_bucket[6+27],a_bucket[7+27]);
-		
-		//------------------AVX evaluation for 4th poly ends------------------
-
-		//------------------AVX evaluation for 5th poly-----------------------
-		
-                    r0_avx=a[4*small_len_avx+0];
-                    r1_avx=a[4*small_len_avx+1];
-                    r2_avx=a[4*small_len_avx+2];
-                    r3_avx=a[4*small_len_avx+3];
-		    a_bucket[0+36]=r0_avx;
-		    a_bucket[1+36]=r1_avx;
-		    a_bucket[2+36]=r2_avx;
-		    a_bucket[3+36]=r3_avx;
-		    a_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+36]= _mm256_add_epi16(a_bucket[6+36],a_bucket[7+36]);
-		
-		//------------------AVX evaluation for 5th poly ends------------------
-
-
-		//------------------AVX evaluation for 6th poly-----------------------
-                    r0_avx=a[5*small_len_avx];
-                    r1_avx=a[5*small_len_avx+1];
-                    r2_avx=a[5*small_len_avx+2];
-                    r3_avx=a[5*small_len_avx+3];
-		    a_bucket[0+45]=r0_avx;
-		    a_bucket[1+45]=r1_avx;
-		    a_bucket[2+45]=r2_avx;
-		    a_bucket[3+45]=r3_avx;
-		    a_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+45]= _mm256_add_epi16(a_bucket[6+45],a_bucket[7+45]);
-		
-		//------------------AVX evaluation for 6th poly ends------------------
-
-		//------------------AVX evaluation for 7th poly-----------------------
-
-                    r0_avx=a[6*small_len_avx];
-                    r1_avx=a[6*small_len_avx+1];
-                    r2_avx=a[6*small_len_avx+2];
-                    r3_avx=a[6*small_len_avx+3];
-		    a_bucket[0+54]=r0_avx;
-		    a_bucket[1+54]=r1_avx;
-		    a_bucket[2+54]=r2_avx;
-		    a_bucket[3+54]=r3_avx;
-		    a_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    a_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    a_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    a_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    a_bucket[8+54]= _mm256_add_epi16(a_bucket[6+54],a_bucket[7+54]);
-
-		//------------------AVX evaluation for 7th poly ends------------------
-		
-	
-
-		//CLOCK2=cpucycles();
-		//CLOCK_EVAL=CLOCK_EVAL+(CLOCK2-CLOCK1);
-		//printf("\nTime for multiplication : %llu\n", CLOCK2-CLOCK1);
-
-
-		//CLOCK1=cpucycles();
-		//-----------------Forward transposes--------------------------------------
-			transpose_n1(a_bucket);
-			transpose_n1(a_bucket+16);
-			transpose_n1(a_bucket+32);
-			transpose_n1(a_bucket+48);
-
-		//-----------------Forwatrd transposes ends---------------------------------
-
-		//----------------------all multiplications---------------------------------
-		if(f==0){
-			schoolbook_avx_new2(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new2(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new2(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		else{
-			schoolbook_avx_new3_acc(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE);
-			//schoolbook_avx_new3_acc_fused(a_bucket, b_bucket, c_bucket);
-			schoolbook_avx_new3_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE);
-			schoolbook_avx_new3_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE);
-		}
-		/*
-		schoolbook_avx_new2_acc(a_bucket, b_bucket, c_bucket, f);
-		schoolbook_avx_new2_acc(a_bucket+16, b_bucket+16, c_bucket+2*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+32, b_bucket+32, c_bucket+4*SCM_SIZE, f);
-		schoolbook_avx_new2_acc(a_bucket+48, b_bucket+48, c_bucket+6*SCM_SIZE, f);
-		*/
-
-
-		//----------------------all multiplications ends-----------------------------
-
-
-		//-----------------Reverse transposes--------------------------------------
-
-			/*
-			transpose(c_bucket);
-			transpose(c_bucket+16);
-
-			transpose(c_bucket+2*SCM_SIZE);
-			transpose(c_bucket+16+2*SCM_SIZE);
-
-			transpose(c_bucket+4*SCM_SIZE);
-			transpose(c_bucket+16+4*SCM_SIZE);
-
-			transpose(c_bucket+6*SCM_SIZE);
-			transpose(c_bucket+16+6*SCM_SIZE);
-			*/
-		//-----------------Reverse transposes ends---------------------------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_MULT=CLOCK_MULT+(CLOCK2-CLOCK1);
-
-		//KARA_interpol(c_bucket, result_final0, result_final1, result_final2, result_final3, result_final4, result_final5, result_final6);
-		
-}
-
-static void KARA_eval(__m256i* b, __m256i *b_bucket){
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx;
-
-
-		//-------1st poly----------------------------------------------------
-                    r0_avx=b[0];
-                    r1_avx=b[1];
-                    r2_avx=b[2];
-                    r3_avx=b[3];
-		    b_bucket[0]=r0_avx;
-		    b_bucket[1]=r1_avx;
-		    b_bucket[2]=r2_avx;
-		    b_bucket[3]=r3_avx;
-		    b_bucket[4]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8]= _mm256_add_epi16(b_bucket[6],b_bucket[7]);
-		//-------2nd poly----------------------------------------------------
-
-                    r0_avx=b[small_len_avx];
-                    r1_avx=b[small_len_avx+1];
-                    r2_avx=b[small_len_avx+2];
-                    r3_avx=b[small_len_avx+3];
-		    b_bucket[0+9]=r0_avx;
-		    b_bucket[1+9]=r1_avx;
-		    b_bucket[2+9]=r2_avx;
-		    b_bucket[3+9]=r3_avx;
-		    b_bucket[4+9]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+9]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+9]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+9]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+9]= _mm256_add_epi16(b_bucket[6+9],b_bucket[7+9]);
-
-		//-------3rd poly----------------------------------------------------
-
-                    r0_avx=b[2*small_len_avx+0];
-                    r1_avx=b[2*small_len_avx+1];
-                    r2_avx=b[2*small_len_avx+2];
-                    r3_avx=b[2*small_len_avx+3];
-		    b_bucket[0+18]=r0_avx;
-		    b_bucket[1+18]=r1_avx;
-		    b_bucket[2+18]=r2_avx;
-		    b_bucket[3+18]=r3_avx;
-		    b_bucket[4+18]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+18]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+18]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+18]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+18]= _mm256_add_epi16(b_bucket[6+18],b_bucket[7+18]);
-
-		//-------4th poly----------------------------------------------------
-                    r0_avx=b[3*small_len_avx];
-                    r1_avx=b[3*small_len_avx+1];
-                    r2_avx=b[3*small_len_avx+2];
-                    r3_avx=b[3*small_len_avx+3];
-		    b_bucket[0+27]=r0_avx;
-		    b_bucket[1+27]=r1_avx;
-		    b_bucket[2+27]=r2_avx;
-		    b_bucket[3+27]=r3_avx;
-		    b_bucket[4+27]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+27]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+27]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+27]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+27]= _mm256_add_epi16(b_bucket[6+27],b_bucket[7+27]);
-
-		//-------5th poly----------------------------------------------------
-
-                    r0_avx=b[4*small_len_avx];
-                    r1_avx=b[4*small_len_avx+1];
-                    r2_avx=b[4*small_len_avx+2];
-                    r3_avx=b[4*small_len_avx+3];
-		    b_bucket[0+36]=r0_avx;
-		    b_bucket[1+36]=r1_avx;
-		    b_bucket[2+36]=r2_avx;
-		    b_bucket[3+36]=r3_avx;
-		    b_bucket[4+36]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+36]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+36]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+36]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+36]= _mm256_add_epi16(b_bucket[6+36],b_bucket[7+36]);
-
-		//-------6th poly----------------------------------------------------
-
-                    r0_avx=b[5*small_len_avx];
-                    r1_avx=b[5*small_len_avx+1];
-                    r2_avx=b[5*small_len_avx+2];
-                    r3_avx=b[5*small_len_avx+3];
-		    b_bucket[0+45]=r0_avx;
-		    b_bucket[1+45]=r1_avx;
-		    b_bucket[2+45]=r2_avx;
-		    b_bucket[3+45]=r3_avx;
-		    b_bucket[4+45]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+45]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+45]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+45]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+45]= _mm256_add_epi16(b_bucket[6+45],b_bucket[7+45]);
-
-		//-------7th poly----------------------------------------------------
-
-                    r0_avx=b[6*small_len_avx];
-                    r1_avx=b[6*small_len_avx+1];
-                    r2_avx=b[6*small_len_avx+2];
-                    r3_avx=b[6*small_len_avx+3];
-		    b_bucket[0+54]=r0_avx;
-		    b_bucket[1+54]=r1_avx;
-		    b_bucket[2+54]=r2_avx;
-		    b_bucket[3+54]=r3_avx;
-		    b_bucket[4+54]= _mm256_add_epi16(r0_avx, r1_avx);
-		    b_bucket[5+54]= _mm256_add_epi16(r2_avx, r3_avx);
-		    b_bucket[6+54]= _mm256_add_epi16(r0_avx, r2_avx);
-		    b_bucket[7+54]= _mm256_add_epi16(r1_avx, r3_avx);
-		    b_bucket[8+54]= _mm256_add_epi16(b_bucket[6+54],b_bucket[7+54]);
-
-		//--------------Evaluating B poly ends-------------------------------
-
-			transpose_n1(b_bucket);
-			transpose_n1(b_bucket+16);
-			transpose_n1(b_bucket+32);
-			transpose_n1(b_bucket+48);	
-}
-
-static void KARA_interpol(__m256i *c_bucket, __m256i* result_final0, __m256i* result_final1, __m256i* result_final2, __m256i* result_final3, __m256i* result_final4, __m256i* result_final5, __m256i* result_final6){
-
-		//int64_t i;
-		register __m256i res_avx0, res_avx1, res_avx2, res_avx3, res_avx4, res_avx5, res_avx6, res_avx7; // to hold each 64X64 poly mul results
-
-		__m256i temp, c6_avx, c7_avx, c8_avx, c20_avx, c21_avx, c22_avx, c23_avx, c24_avx;
-
-		//CLOCK1=cpucycles();
-
-		   //------------------------AVX interpolation for 1st poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[0];
-				res_avx2 = c_bucket[1];
-				res_avx4 = c_bucket[2];
-				res_avx6 = c_bucket[3];
-
-				c6_avx=c_bucket[6];
-				c7_avx=c_bucket[7];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[8], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[16];
-				res_avx3 = c_bucket[17];
-				res_avx5 = c_bucket[18];
-				res_avx7 = c_bucket[19];
-
-				c22_avx=c_bucket[22];
-				c23_avx=c_bucket[23];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[21], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[24], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[20], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[5], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[4], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final0[0]=res_avx0;
-				result_final0[1]=res_avx1;
-
-				result_final0[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final0[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final0[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final0[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final0[6]=res_avx6;
-				result_final0[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 1st poly ends--------------
-
-
-		   //------------------------AVX interpolation for 2nd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[9]; //c_bucket0
-				res_avx2 = c_bucket[10]; //c_bucket1
-				res_avx4 = c_bucket[11]; //c_bucket2
-				res_avx6 = c_bucket[12]; //c_bucket3
-
-				c6_avx=c_bucket[15]; //c_bucket6
-				c7_avx=c_bucket[32]; //c_bucket7
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[33], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[25]; //c_bucket0
-				res_avx3 = c_bucket[26]; //c_bucket1
-				res_avx5 = c_bucket[27]; //c_bucket2
-				res_avx7 = c_bucket[28]; //c_bucket3
-
-				c22_avx=c_bucket[31];
-				c23_avx=c_bucket[48];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[30], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[49], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[29], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[14], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[13], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final1[0]=res_avx0;
-				result_final1[1]=res_avx1;
-
-				result_final1[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final1[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final1[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final1[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final1[6]=res_avx6;
-				result_final1[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 2nd poly ends--------------
-
-		   //------------------------AVX interpolation for 3rd poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[34]; //c_bucket0
-				res_avx2 = c_bucket[35]; //c_bucket1
-				res_avx4 = c_bucket[36];
-				res_avx6 = c_bucket[37];
-
-				c6_avx=c_bucket[40];
-				c7_avx=c_bucket[41];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[42], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[50]; //c_bucket0
-				res_avx3 = c_bucket[51]; //c_bucket1
-				res_avx5 = c_bucket[52];
-				res_avx7 = c_bucket[53];
-
-				c22_avx=c_bucket[56];
-				c23_avx=c_bucket[57];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[55], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[58], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[54], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[39], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[38], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-			//loop4
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-			//loop5
-				result_final2[0]=res_avx0;
-				result_final2[1]=res_avx1;
-
-				result_final2[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final2[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final2[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final2[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final2[6]=res_avx6;
-				result_final2[7]=res_avx7;
-
-		   //------------------------AVX interpolation for 3rd poly ends--------------
-		
-		   //------------------------AVX interpolation for 4th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[43];
-				res_avx2 = c_bucket[44];
-				res_avx4 = c_bucket[45];
-				res_avx6 = c_bucket[46];
-
-				c6_avx=c_bucket[65];
-				c7_avx=c_bucket[66];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[67], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[59];
-				res_avx3 = c_bucket[60];
-				res_avx5 = c_bucket[61];
-				res_avx7 = c_bucket[62];
-
-				c22_avx=c_bucket[81];
-				c23_avx=c_bucket[82];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[80], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[83], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[63], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[64], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[47], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final3[0]=res_avx0;
-				result_final3[1]=res_avx1;
-
-				result_final3[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final3[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final3[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final3[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final3[6]=res_avx6;
-				result_final3[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 4th poly ends--------------
-
-		   //------------------------AVX interpolation for 5th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[68];
-				res_avx2 = c_bucket[69];
-				res_avx4 = c_bucket[70];
-				res_avx6 = c_bucket[71];
-
-				c6_avx=c_bucket[74];
-				c7_avx=c_bucket[75];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[76], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[84];
-				res_avx3 = c_bucket[85];
-				res_avx5 = c_bucket[86];
-				res_avx7 = c_bucket[87];
-
-				c22_avx=c_bucket[90];
-				c23_avx=c_bucket[91];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[89], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[92], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[88], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[73], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[72], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final4[0]=res_avx0;
-				result_final4[1]=res_avx1;
-
-				result_final4[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final4[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final4[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final4[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final4[6]=res_avx6;
-				result_final4[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 5th poly ends--------------
-
-		   //------------------------AVX interpolation for 6th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[77];
-				res_avx2 = c_bucket[78];
-				res_avx4 = c_bucket[79];
-				res_avx6 = c_bucket[96];
-
-				c6_avx=c_bucket[99];
-				c7_avx=c_bucket[100];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[101], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[93];
-				res_avx3 = c_bucket[94];
-				res_avx5 = c_bucket[95];
-				res_avx7 = c_bucket[112];
-
-				c22_avx=c_bucket[115];
-				c23_avx=c_bucket[116];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[114], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[117], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[113], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[98], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[97], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final5[0]=res_avx0;
-				result_final5[1]=res_avx1;
-
-				result_final5[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final5[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final5[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final5[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final5[6]=res_avx6;
-				result_final5[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 6th poly ends--------------
-
-		   //------------------------AVX interpolation for 7th poly external-------------------		
-			
-			//loop1
-				res_avx0 = c_bucket[102];
-				res_avx2 = c_bucket[103];
-				res_avx4 = c_bucket[104];
-				res_avx6 = c_bucket[105];
-
-				c6_avx=c_bucket[108];
-				c7_avx=c_bucket[109];
-		
-				c8_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[110], c6_avx), c7_avx);
-
-				res_avx1 = c_bucket[118];
-				res_avx3 = c_bucket[119];
-				res_avx5 = c_bucket[120];
-				res_avx7 = c_bucket[121];
-
-				c22_avx=c_bucket[124];
-				c23_avx=c_bucket[125];
-
-				c21_avx=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[123], res_avx5),res_avx7);
-
-				c24_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[126], c22_avx), c23_avx);
-
-				c20_avx = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[122], res_avx1), res_avx3);
-
-				temp=_mm256_sub_epi16(_mm256_sub_epi16(c_bucket[107], res_avx4),res_avx6);
-				res_avx5 = _mm256_add_epi16(res_avx5, temp);
-
-				temp = _mm256_sub_epi16(_mm256_sub_epi16(c_bucket[106], res_avx0), res_avx2);
-				res_avx1 = _mm256_add_epi16(res_avx1, temp);
-
-				c22_avx=_mm256_add_epi16(c22_avx, c8_avx);
-
-				res_avx6 = _mm256_add_epi16(res_avx6, c21_avx);
-
-				res_avx2 = _mm256_add_epi16(res_avx2, c20_avx);
-
-				c7_avx=_mm256_add_epi16(c7_avx, c24_avx);
-
-
-			//loop4
-
-				c6_avx=_mm256_sub_epi16(_mm256_sub_epi16(c6_avx, res_avx0), res_avx4);
-				c22_avx=_mm256_sub_epi16(_mm256_sub_epi16(c22_avx, res_avx1), res_avx5);
-
-				c7_avx=_mm256_sub_epi16(_mm256_sub_epi16(c7_avx, res_avx2), res_avx6);
-				c23_avx=_mm256_sub_epi16(_mm256_sub_epi16(c23_avx, res_avx3), res_avx7);
-
-			//loop5
-				result_final6[0]=res_avx0;
-				result_final6[1]=res_avx1;
-
-				result_final6[2]=_mm256_add_epi16(res_avx2, c6_avx);
-				result_final6[3]=_mm256_add_epi16(res_avx3, c22_avx);
-
-
-				result_final6[4]=_mm256_add_epi16(res_avx4, c7_avx);
-				result_final6[5]=_mm256_add_epi16(res_avx5, c23_avx);
-
-				result_final6[6]=res_avx6;
-				result_final6[7]=res_avx7;
-
-
-		   //------------------------AVX interpolation for 7th poly ends--------------
-
-		//CLOCK2=cpucycles();
-		//CLOCK_INTER=CLOCK_INTER+(CLOCK2-CLOCK1);
-		//printf("\nTime for interpolation : %llu\n", CLOCK2-CLOCK1);
-
-
-
-}
-
-static void toom_cook_4way_avx_n1(__m256i* a_avx,__m256i* b_bucket, __m256i *c_bucket, int f){ 
-
-	int i;
-
-//---------------AVX data-----------------------------
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-	__m256i aw_avx[7*small_len_avx];
-
-//----------------AVX data----------------------------
-
-
-// EVALUATION
-
-	//CLOCK1=cpucycles();
-
-	for (i=0; i<small_len_avx; i++){
-		r0_avx=a_avx[i];
-		r1_avx=a_avx[i + small_len_avx];
-		r2_avx=a_avx[i + 2*small_len_avx];
-		r3_avx=a_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		aw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		aw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		aw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		aw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		aw_avx[6*small_len_avx+i]= r0_avx; 
-		aw_avx[i]= r3_avx;
-	}
-
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_EVAL=CLOCK_TC_EVAL+(CLOCK2-CLOCK1);
-
-	batch_64coefficient_multiplications_new(aw_avx, b_bucket, c_bucket, f);//New
-
-}
-
-static void TC_eval(__m256i* b_avx, __m256i* b_bucket){
-
-	int i;
-	__m256i bw_avx[7*small_len_avx];
-
-	__m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx;
-
-	for (i=0; i<small_len_avx; i++){
-		
-		r0_avx=b_avx[i];
-		r1_avx=b_avx[i + small_len_avx];
-		r2_avx=b_avx[i + 2*small_len_avx];
-		r3_avx=b_avx[i + 3*small_len_avx];
-		r4_avx= _mm256_add_epi16(r0_avx, r2_avx);
-		r5_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		bw_avx[2*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[3*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx=_mm256_slli_epi16(r0_avx,2);
-		r4_avx=_mm256_add_epi16(r4_avx,r2_avx);
-		r4_avx=_mm256_slli_epi16(r4_avx,1);
-		r5_avx=	_mm256_slli_epi16(r1_avx, 2);
-		r5_avx=_mm256_add_epi16(r5_avx, r3_avx);
-		bw_avx[4*small_len_avx+i]= _mm256_add_epi16(r4_avx, r5_avx);
-		bw_avx[5*small_len_avx+i]= _mm256_sub_epi16(r4_avx, r5_avx);
-		r4_avx= _mm256_slli_epi16(r3_avx, 3);
-		r6_avx= _mm256_slli_epi16(r2_avx, 2);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		r6_avx= _mm256_slli_epi16(r1_avx, 1);
-		r4_avx= _mm256_add_epi16(r4_avx, r6_avx);
-		bw_avx[small_len_avx+i]= _mm256_add_epi16(r4_avx, r0_avx);
-		bw_avx[6*small_len_avx+i]= r0_avx;
-		bw_avx[i]= r3_avx;
-	}
-
-	KARA_eval(bw_avx, b_bucket);
-
-}
-
-
-static void TC_interpol(__m256i *c_bucket, __m256i* res_avx){
-
-	int i;
-
-	register __m256i r0_avx, r1_avx, r2_avx, r3_avx, r4_avx, r5_avx, r6_avx, temp_avx;
-
-	__m256i w1_avx[2*small_len_avx],w2_avx[2*small_len_avx],w3_avx[2*small_len_avx],w4_avx[2*small_len_avx],w5_avx[2*small_len_avx],w6_avx[2*small_len_avx],w7_avx[2*small_len_avx];
-
-	__m256i res_avx_output[2*AVX_N1];
-
-	//CLOCK1=cpucycles();
-
-	
-	transpose_n1(c_bucket);
-	transpose_n1(c_bucket+16);
-
-	transpose_n1(c_bucket+2*SCM_SIZE);
-	transpose_n1(c_bucket+16+2*SCM_SIZE);
-
-	transpose_n1(c_bucket+4*SCM_SIZE);
-	transpose_n1(c_bucket+16+4*SCM_SIZE);
-
-	transpose_n1(c_bucket+6*SCM_SIZE);
-	transpose_n1(c_bucket+16+6*SCM_SIZE);
-	
-
-	KARA_interpol(c_bucket, w1_avx, w2_avx, w3_avx, w4_avx, w5_avx, w6_avx, w7_avx);
-
-	for (i = 0; i < 2*small_len_avx; i++) {
-
-		r0_avx = w1_avx[i];
-		r1_avx = w2_avx[i];
-		r2_avx = w3_avx[i];
-		r3_avx = w4_avx[i];
-		r4_avx = w5_avx[i];
-		r5_avx = w6_avx[i];
-		r6_avx = w7_avx[i];
-		r1_avx = _mm256_add_epi16(r1_avx, r4_avx);
-		r5_avx = _mm256_sub_epi16(r5_avx, r4_avx);
-		r3_avx = _mm256_sub_epi16(r3_avx, r2_avx);
-		r3_avx = _mm256_srli_epi16(r3_avx, 1);
-		r4_avx = _mm256_sub_epi16(r4_avx, r0_avx);
-		temp_avx = _mm256_slli_epi16(r6_avx, 6);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_slli_epi16(r4_avx, 1);
-		r4_avx = _mm256_add_epi16(r4_avx, r5_avx);
-		r2_avx = _mm256_add_epi16(r2_avx, r3_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 6);
-		r1_avx = _mm256_sub_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r2_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r6_avx);
-		r2_avx = _mm256_sub_epi16(r2_avx, r0_avx);
-		temp_avx = _mm256_mullo_epi16 (r2_avx, _mm256_set1_epi16(45));
-		r1_avx = _mm256_add_epi16(r1_avx, temp_avx);
-		temp_avx = _mm256_slli_epi16(r2_avx, 3);
-		r4_avx = _mm256_sub_epi16(r4_avx, temp_avx);
-		r4_avx = _mm256_mullo_epi16 (r4_avx, _mm256_set1_epi16(-21845)); // -21845 = 1/3 (mod 2^16)
-		r4_avx = _mm256_srli_epi16(r4_avx, 3);
-		r5_avx = _mm256_add_epi16(r5_avx, r1_avx);
-		temp_avx = _mm256_slli_epi16(r3_avx, 4);
-		r1_avx= _mm256_add_epi16(r1_avx, temp_avx);
-		r1_avx = _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(-29127)); // -29127 = 1/9 (mod 2^16)
-		r1_avx= _mm256_srli_epi16(r1_avx, 1); 	
-		r3_avx= _mm256_add_epi16(r1_avx, r3_avx);
-		r3_avx= _mm256_sub_epi16(_mm256_set1_epi16(0), r3_avx);
-		temp_avx= _mm256_mullo_epi16 (r1_avx, _mm256_set1_epi16(30));
-		temp_avx= _mm256_sub_epi16(temp_avx, r5_avx);
-		temp_avx= _mm256_mullo_epi16 (temp_avx, _mm256_set1_epi16(-4369)); // -4369 = 1/15 (mod 2^16)
-		r5_avx= _mm256_srli_epi16(temp_avx, 2);
-		r2_avx = _mm256_sub_epi16(r2_avx, r4_avx);
-		r1_avx = _mm256_sub_epi16(r1_avx, r5_avx);
-
-		if(i<small_len_avx){
-			res_avx_output[0*small_len_avx+i]=r6_avx;
-			res_avx_output[1*small_len_avx+i]=r5_avx;
-			res_avx_output[2*small_len_avx+i]=r4_avx;
-			res_avx_output[3*small_len_avx+i]=r3_avx;
-			res_avx_output[4*small_len_avx+i]=r2_avx;
-			res_avx_output[5*small_len_avx+i]=r1_avx;
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-		else{
-			res_avx_output[0*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[0*small_len_avx+i], r6_avx);
-			res_avx_output[1*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[1*small_len_avx+i], r5_avx);
-			res_avx_output[2*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[2*small_len_avx+i], r4_avx);
-			res_avx_output[3*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[3*small_len_avx+i], r3_avx);
-			res_avx_output[4*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[4*small_len_avx+i], r2_avx);
-			res_avx_output[5*small_len_avx+i]=_mm256_add_epi16 (res_avx_output[5*small_len_avx+i], r1_avx);
-			res_avx_output[6*small_len_avx+i]=r0_avx;
-		}
-	}
-
-	//CLOCK2=cpucycles();
-	//CLOCK_TC_INTER=CLOCK_TC_INTER+(CLOCK2-CLOCK1);
-
-	// Reduction by X^256 + 1
-	for(i=0; i<16; i++)
-  {
-		res_avx[i] = _mm256_sub_epi16(res_avx_output[i], res_avx_output[i+16]);
-  }
-
-}
diff --git a/crypto_kem/saber/clean/SABER_indcpa.c b/crypto_kem/saber/clean/SABER_indcpa.c
index 23325749..3cc2367c 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.c
+++ b/crypto_kem/saber/clean/SABER_indcpa.c
@@ -11,81 +11,102 @@
 #define h2 ((1 << (SABER_EP - 2)) - (1 << (SABER_EP - SABER_ET - 1)) + (1 << (SABER_EQ - SABER_EP - 1)))
 
 void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N] = {{0}};
-
-    uint8_t seed_A[SABER_SEEDBYTES];
-    uint8_t seed_s[SABER_NOISE_SEEDBYTES];
     size_t i, j;
 
+    poly A[SABER_L][SABER_L]; // public matrix, expanded from seed_A
+    poly s[SABER_L]; // secret vector, sampled from the noise seed
+    poly res[SABER_L]; // matrix-vector product; rounded in place and packed into pk
+
+    uint8_t rand[SABER_NOISESEEDBYTES]; // noise seed for GenSecret (local buffer, not written to pk or sk)
+    uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES; // seed is generated in place inside pk, right after the packed vector
+
     randombytes(seed_A, SABER_SEEDBYTES);
     shake128(seed_A, SABER_SEEDBYTES, seed_A, SABER_SEEDBYTES); // for not revealing system RNG state
-    randombytes(seed_s, SABER_NOISE_SEEDBYTES);
 
-    PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_SABER_CLEAN_GenSecret(s, seed_s);
-    PQCLEAN_SABER_CLEAN_MatrixVectorMul(b, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])s, 1);
+    randombytes(rand, SABER_NOISESEEDBYTES);
+    PQCLEAN_SABER_CLEAN_GenSecret(s, rand);
+    PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, s); // pack the secret vector into sk
 
+    PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A); // sample matrix A
+    PQCLEAN_SABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 1); // Matrix in transposed order
+
+
+    // rounding
     for (i = 0; i < SABER_L; i++) {
         for (j = 0; j < SABER_N; j++) {
-            b[i][j] = (b[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1; // add the rounding constant before dropping the low bits
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP; // drop the SABER_EQ - SABER_EP low bits
+            res[i].coeffs[j] &= SABER_Q - 1; // NOTE(review): runs after the shift; with SABER_EQ = 13, SABER_EP = 10 a shifted uint16_t already fits in 13 bits, so this clears nothing - confirm intended order
         }
     }
 
-    PQCLEAN_SABER_CLEAN_POLVECq2BS(sk, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, (const uint16_t (*)[SABER_N])b);
-    memcpy(pk + SABER_POLYVECCOMPRESSEDBYTES, seed_A, sizeof(seed_A));
+    PQCLEAN_SABER_CLEAN_POLVECp2BS(pk, res); // pack public key
 }
 
-void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
-    uint16_t A[SABER_L][SABER_L][SABER_N];
-    uint16_t sp[SABER_L][SABER_N];
-    uint16_t bp[SABER_L][SABER_N] = {{0}};
-    uint16_t vp[SABER_N] = {0};
-    uint16_t mp[SABER_N];
-    uint16_t b[SABER_L][SABER_N];
+
+void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]) {
     size_t i, j;
+
+    poly A[SABER_L][SABER_L]; // public matrix; its first row is recycled as scratch once the product is done
+    poly res[SABER_L]; // matrix-vector product, rounded and packed as the first ciphertext part
+    poly s[SABER_L]; // ephemeral secret derived from noiseseed
+    poly *temp = A[0]; // re-use stack space
+    poly *vprime = &A[0][0]; // same storage as temp[0]
+    poly *message = &A[0][1]; // same storage as temp[1]
+
     const uint8_t *seed_A = pk + SABER_POLYVECCOMPRESSEDBYTES;
+    uint8_t *msk_c = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // second ciphertext part, written by POLT2BS at the end
 
+    PQCLEAN_SABER_CLEAN_GenSecret(s, noiseseed);
     PQCLEAN_SABER_CLEAN_GenMatrix(A, seed_A);
-    PQCLEAN_SABER_CLEAN_GenSecret(sp, seed_sp);
-    PQCLEAN_SABER_CLEAN_MatrixVectorMul(bp, (const uint16_t (*)[SABER_L][SABER_N])A, (const uint16_t (*)[SABER_N])sp, 0);
+    PQCLEAN_SABER_CLEAN_MatrixVectorMul(res, (const poly (*)[SABER_L])A, (const poly *)s, 0); // 0 => not transposed
 
-    for (i = 0; i < SABER_L; i++) {
+
+    // rounding
+    for (i = 0; i < SABER_L; i++) { //shift right EQ-EP bits
         for (j = 0; j < SABER_N; j++) {
-            bp[i][j] = (bp[i][j] + h1) >> (SABER_EQ - SABER_EP);
+            res[i].coeffs[j] += h1; // add the rounding constant before dropping the low bits
+            res[i].coeffs[j] >>= SABER_EQ - SABER_EP;
+            res[i].coeffs[j] &= SABER_Q - 1; // NOTE(review): runs after the shift; with SABER_EQ = 13, SABER_EP = 10 a shifted uint16_t already fits in 13 bits, so this clears nothing - confirm intended order
         }
     }
+    PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, res); // first ciphertext part
 
-    PQCLEAN_SABER_CLEAN_POLVECp2BS(ciphertext, (const uint16_t (*)[SABER_N])bp);
-    PQCLEAN_SABER_CLEAN_BS2POLVECp(b, pk);
-    PQCLEAN_SABER_CLEAN_InnerProd(vp, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])sp);
-
-    PQCLEAN_SABER_CLEAN_BS2POLmsg(mp, m);
-
-    for (j = 0; j < SABER_N; j++) {
-        vp[j] = (vp[j] - (mp[j] << (SABER_EP - 1)) + h1) >> (SABER_EP - SABER_ET);
-    }
-
-    PQCLEAN_SABER_CLEAN_POLT2BS(ciphertext + SABER_POLYVECCOMPRESSEDBYTES, vp);
-}
-
-void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
-
-    uint16_t s[SABER_L][SABER_N];
-    uint16_t b[SABER_L][SABER_N];
-    uint16_t v[SABER_N] = {0};
-    uint16_t cm[SABER_N];
-    size_t i;
-
-    PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk);
-    PQCLEAN_SABER_CLEAN_BS2POLVECp(b, ciphertext);
-    PQCLEAN_SABER_CLEAN_InnerProd(v, (const uint16_t (*)[SABER_N])b, (const uint16_t (*)[SABER_N])s);
-    PQCLEAN_SABER_CLEAN_BS2POLT(cm, ciphertext + SABER_POLYVECCOMPRESSEDBYTES);
+    // vector-vector scalar multiplication with mod p
+    PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, pk); // unpacked public vector overwrites row 0 of A, which is no longer needed
+    PQCLEAN_SABER_CLEAN_InnerProd(vprime, temp, s); // output shares storage with temp[0]; poly_mul builds each product in a local buffer before storing to c
+    PQCLEAN_SABER_CLEAN_BS2POLmsg(message, m); // overwrites temp[1], which InnerProd has finished reading
 
     for (i = 0; i < SABER_N; i++) {
-        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
+        vprime->coeffs[i] += h1 - (message->coeffs[i] << (SABER_EP - 1)); // message bit is placed at bit SABER_EP - 1 and subtracted
+        vprime->coeffs[i] &= SABER_P - 1; // reduce to SABER_EP bits before truncating
+        vprime->coeffs[i] >>= SABER_EP - SABER_ET; // keep the top SABER_ET bits
+    }
+
+    PQCLEAN_SABER_CLEAN_POLT2BS(msk_c, vprime);
+}
+
+
+void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]) {
+    size_t i;
+
+    poly temp[SABER_L]; // unpacked ciphertext vector; slots 0 and 1 are recycled as v and cm
+    poly s[SABER_L]; // secret vector unpacked from sk
+
+    const uint8_t *packed_cm = ciphertext + SABER_POLYVECCOMPRESSEDBYTES; // second ciphertext part, unpacked by BS2POLT
+    poly *v = &temp[0]; // inner-product result, stored over temp[0]
+    poly *cm = &temp[1]; // filled only after InnerProd has consumed temp[1]
+
+    PQCLEAN_SABER_CLEAN_BS2POLVECq(s, sk);
+    PQCLEAN_SABER_CLEAN_BS2POLVECp(temp, ciphertext);
+    PQCLEAN_SABER_CLEAN_InnerProd(&temp[0], temp, s); // output shares storage with the first input; poly_mul builds each product in a local buffer before storing to c
+
+    PQCLEAN_SABER_CLEAN_BS2POLT(cm, packed_cm);
+
     for (i = 0; i < SABER_N; i++) {
-        v[i] = (v[i] + h2 - (cm[i] << (SABER_EP - SABER_ET))) >> (SABER_EP - 1);
+        v->coeffs[i] += h2 - (cm->coeffs[i] << (SABER_EP - SABER_ET)); // cm is scaled back up to SABER_EP bits before subtracting
+        v->coeffs[i] &= SABER_P - 1; // reduce to SABER_EP bits
+        v->coeffs[i] >>= SABER_EP - 1; // only the top bit of the SABER_EP-bit value remains
     }
 
     PQCLEAN_SABER_CLEAN_POLmsg2BS(m, v);
diff --git a/crypto_kem/saber/clean/SABER_indcpa.h b/crypto_kem/saber/clean/SABER_indcpa.h
index 3be3ce1c..a5e89e96 100644
--- a/crypto_kem/saber/clean/SABER_indcpa.h
+++ b/crypto_kem/saber/clean/SABER_indcpa.h
@@ -5,7 +5,7 @@
 
 void PQCLEAN_SABER_CLEAN_indcpa_kem_keypair(uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES], uint8_t sk[SABER_INDCPA_SECRETKEYBYTES]);
 
-void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t seed_sp[SABER_NOISE_SEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
+void PQCLEAN_SABER_CLEAN_indcpa_kem_enc(uint8_t ciphertext[SABER_BYTES_CCA_DEC], const uint8_t m[SABER_KEYBYTES], const uint8_t noiseseed[SABER_NOISESEEDBYTES], const uint8_t pk[SABER_INDCPA_PUBLICKEYBYTES]);
 
 void PQCLEAN_SABER_CLEAN_indcpa_kem_dec(uint8_t m[SABER_KEYBYTES], const uint8_t sk[SABER_INDCPA_SECRETKEYBYTES], const uint8_t ciphertext[SABER_BYTES_CCA_DEC]);
 
diff --git a/crypto_kem/saber/clean/SABER_params.h b/crypto_kem/saber/clean/SABER_params.h
index 200ed0e6..d1a5ddd7 100644
--- a/crypto_kem/saber/clean/SABER_params.h
+++ b/crypto_kem/saber/clean/SABER_params.h
@@ -2,19 +2,21 @@
 #define PARAMS_H
 
 
-/* Change this for different security strengths */
-
 /* Don't change anything below this line */
 #define SABER_L 3
 #define SABER_MU 8
 #define SABER_ET 4
 
-#define SABER_EQ 13
-#define SABER_EP 10
 #define SABER_N 256
 
+#define SABER_EP 10
+#define SABER_P (1 << SABER_EP)
+
+#define SABER_EQ 13
+#define SABER_Q (1 << SABER_EQ)
+
 #define SABER_SEEDBYTES 32
-#define SABER_NOISE_SEEDBYTES 32
+#define SABER_NOISESEEDBYTES 32
 #define SABER_KEYBYTES 32
 #define SABER_HASHBYTES 32
 
diff --git a/crypto_kem/saber/clean/api.h b/crypto_kem/saber/clean/api.h
index 699a19f4..7448d46d 100644
--- a/crypto_kem/saber/clean/api.h
+++ b/crypto_kem/saber/clean/api.h
@@ -15,4 +15,4 @@ int PQCLEAN_SABER_CLEAN_crypto_kem_enc(unsigned char *ct, unsigned char *k, cons
 int PQCLEAN_SABER_CLEAN_crypto_kem_dec(unsigned char *k, const unsigned char *ct, const unsigned char *sk);
 
 
-#endif /* api_h */
+#endif /* PQCLEAN_SABER_CLEAN_API_H */
diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c
index e196bd34..2542f0b0 100644
--- a/crypto_kem/saber/clean/pack_unpack.c
+++ b/crypto_kem/saber/clean/pack_unpack.c
@@ -1,132 +1,145 @@
-#include "api.h"
+#include "SABER_params.h"
 #include "pack_unpack.h"
+#include "poly.h"
 #include <string.h>
 
-void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, advances two coefficients per iteration
+    uint8_t *out = bytes; // write cursor, advances one byte per iteration
     for (j = 0; j < SABER_N / 2; j++) {
-        offset_byte = j;
-        offset_data = 2 * j;
-        bytes[offset_byte] = (data[offset_data] & 0x0f) | ((data[offset_data + 1] & 0x0f) << 4);
+        out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4); // low 4 bits of two coefficients: first in the low nibble, second in the high nibble
+        in += 2;
+        out += 1;
     }
 }
 
-void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
-    size_t j, offset_byte, offset_data;
+void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, one byte per iteration
+    uint16_t *out = data->coeffs; // write cursor, two coefficients per iteration
     for (j = 0; j < SABER_N / 2; j++) {
-        offset_byte = j;
-        offset_data = 2 * j;
-        data[offset_data] = bytes[offset_byte] & 0x0f;
-        data[offset_data + 1] = (bytes[offset_byte] >> 4) & 0x0f;
+        out[0] = in[0] & 0x0f; // low nibble -> first coefficient
+        out[1] = (in[0] >> 4) & 0x0f; // high nibble -> second coefficient
+        in += 1;
+        out += 2;
     }
 }
 
-static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, 8 coefficients per iteration
+    uint8_t *out = bytes; // write cursor, 13 bytes per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x1f) | ((data[offset_data + 1] & 0x07) << 5);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 3) & 0xff);
-        bytes[offset_byte + 3] = ((data[offset_data + 1] >> 11) & 0x03) | ((data[offset_data + 2] & 0x3f) << 2);
-        bytes[offset_byte + 4] = ((data[offset_data + 2] >> 6) & 0x7f) | ((data[offset_data + 3] & 0x01) << 7);
-        bytes[offset_byte + 5] = ((data[offset_data + 3] >> 1) & 0xff);
-        bytes[offset_byte + 6] = ((data[offset_data + 3] >> 9) & 0x0f) | ((data[offset_data + 4] & 0x0f) << 4);
-        bytes[offset_byte + 7] = ((data[offset_data + 4] >> 4) & 0xff);
-        bytes[offset_byte + 8] = ((data[offset_data + 4] >> 12) & 0x01) | ((data[offset_data + 5] & 0x7f) << 1);
-        bytes[offset_byte + 9] = ((data[offset_data + 5] >> 7) & 0x3f) | ((data[offset_data + 6] & 0x03) << 6);
-        bytes[offset_byte + 10] = ((data[offset_data + 6] >> 2) & 0xff);
-        bytes[offset_byte + 11] = ((data[offset_data + 6] >> 10) & 0x07) | ((data[offset_data + 7] & 0x1f) << 3);
-        bytes[offset_byte + 12] = ((data[offset_data + 7] >> 5) & 0xff);
+        out[0] = (in[0] & (0xff)); // 8 coefficients x 13 bits are laid out back to back, low bits first, filling 13 bytes
+        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
+        out[2] = ((in[1] >> 3) & 0xff);
+        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
+        out[5] = ((in[3] >> 1) & 0xff);
+        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
+        out[7] = ((in[4] >> 4) & 0xff);
+        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
+        out[10] = ((in[6] >> 2) & 0xff);
+        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
+        out[12] = ((in[7] >> 5) & 0xff);
+        in += 8;
+        out += 13;
     }
 }
 
-static void BS2POLq(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, 13 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor, 8 coefficients per iteration
     for (j = 0; j < SABER_N / 8; j++) {
-        offset_byte = 13 * j;
-        offset_data = 8 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x1f) << 8);
-        data[offset_data + 1] = (bytes[offset_byte + 1] >> 5 & (0x07)) | ((bytes[offset_byte + 2] & 0xff) << 3) | ((bytes[offset_byte + 3] & 0x03) << 11);
-        data[offset_data + 2] = (bytes[offset_byte + 3] >> 2 & (0x3f)) | ((bytes[offset_byte + 4] & 0x7f) << 6);
-        data[offset_data + 3] = (bytes[offset_byte + 4] >> 7 & (0x01)) | ((bytes[offset_byte + 5] & 0xff) << 1) | ((bytes[offset_byte + 6] & 0x0f) << 9);
-        data[offset_data + 4] = (bytes[offset_byte + 6] >> 4 & (0x0f)) | ((bytes[offset_byte + 7] & 0xff) << 4) | ((bytes[offset_byte + 8] & 0x01) << 12);
-        data[offset_data + 5] = (bytes[offset_byte + 8] >> 1 & (0x7f)) | ((bytes[offset_byte + 9] & 0x3f) << 7);
-        data[offset_data + 6] = (bytes[offset_byte + 9] >> 6 & (0x03)) | ((bytes[offset_byte + 10] & 0xff) << 2) | ((bytes[offset_byte + 11] & 0x07) << 10);
-        data[offset_data + 7] = (bytes[offset_byte + 11] >> 3 & (0x1f)) | ((bytes[offset_byte + 12] & 0xff) << 5);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8); // each output is a 13-bit value reassembled from 2 or 3 consecutive bytes, low bits first
+        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
+        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
+        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
+        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
+        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
+        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
+        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        in += 13;
+        out += 8;
     }
 }
 
-static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const uint16_t data[SABER_N]) {
-    size_t j, offset_byte, offset_data;
+static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data) {
+    size_t j;
+    const uint16_t *in = data->coeffs; // read cursor, 4 coefficients per iteration
+    uint8_t *out = bytes; // write cursor, 5 bytes per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        bytes[offset_byte + 0] = (data[offset_data + 0] & (0xff));
-        bytes[offset_byte + 1] = ((data[offset_data + 0] >> 8) & 0x03) | ((data[offset_data + 1] & 0x3f) << 2);
-        bytes[offset_byte + 2] = ((data[offset_data + 1] >> 6) & 0x0f) | ((data[offset_data + 2] & 0x0f) << 4);
-        bytes[offset_byte + 3] = ((data[offset_data + 2] >> 4) & 0x3f) | ((data[offset_data + 3] & 0x03) << 6);
-        bytes[offset_byte + 4] = ((data[offset_data + 3] >> 2) & 0xff);
+        out[0] = (in[0] & (0xff)); // only the low 10 bits of each coefficient are emitted; higher bits are masked off
+        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
+        out[4] = ((in[3] >> 2) & 0xff);
+        in += 4;
+        out += 5;
     }
 }
 
-static void BS2POLp(uint16_t data[SABER_N], const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
-    size_t j, offset_byte, offset_data;
+static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES]) {
+    size_t j;
+    const uint8_t *in = bytes; // read cursor, 5 bytes per iteration
+    uint16_t *out = data->coeffs; // write cursor, 4 coefficients per iteration
     for (j = 0; j < SABER_N / 4; j++) {
-        offset_byte = 5 * j;
-        offset_data = 4 * j;
-        data[offset_data + 0] = (bytes[offset_byte + 0] & (0xff)) | ((bytes[offset_byte + 1] & 0x03) << 8);
-        data[offset_data + 1] = ((bytes[offset_byte + 1] >> 2) & (0x3f)) | ((bytes[offset_byte + 2] & 0x0f) << 6);
-        data[offset_data + 2] = ((bytes[offset_byte + 2] >> 4) & (0x0f)) | ((bytes[offset_byte + 3] & 0x3f) << 4);
-        data[offset_data + 3] = ((bytes[offset_byte + 3] >> 6) & (0x03)) | ((bytes[offset_byte + 4] & 0xff) << 2);
+        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8); // each output is a 10-bit value reassembled from two consecutive bytes, low bits first
+        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
+        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
+        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        in += 5;
+        out += 4;
     }
 }
 
-void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]) {
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLq2BS(bytes + i * SABER_POLYBYTES, data[i]);
+        POLq2BS(bytes + i * SABER_POLYBYTES, &data[i]); // polynomial i is packed at byte offset i * SABER_POLYBYTES
     }
 }
 
-void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]) {
+void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]) {
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLq(data[i], bytes + i * SABER_POLYBYTES);
+        BS2POLq(&data[i], bytes + i * SABER_POLYBYTES); // polynomial i is unpacked from byte offset i * SABER_POLYBYTES
     }
 }
 
-void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]) {
+void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]) {
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        POLp2BS(bytes + i * (SABER_EP * SABER_N / 8), data[i]);
+        POLp2BS(bytes + i * SABER_POLYCOMPRESSEDBYTES, &data[i]); // polynomial i is packed at byte offset i * SABER_POLYCOMPRESSEDBYTES
     }
 }
 
-void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
+void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]) {
     size_t i;
     for (i = 0; i < SABER_L; i++) {
-        BS2POLp(data[i], bytes + i * (SABER_EP * SABER_N / 8));
+        BS2POLp(&data[i], bytes + i * SABER_POLYCOMPRESSEDBYTES); // polynomial i is unpacked from byte offset i * SABER_POLYCOMPRESSEDBYTES
     }
 }
 
-void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]) {
+void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]) {
     size_t i, j;
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            data[j * 8 + i] = ((bytes[j] >> i) & 0x01);
+            data->coeffs[j * 8 + i] = ((bytes[j] >> i) & 0x01); // bit i of byte j (least-significant bit first) becomes coefficient 8*j + i, value 0 or 1
         }
     }
 }
 
-void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]) {
+void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data) {
     size_t i, j;
     memset(bytes, 0, SABER_KEYBYTES);
 
     for (j = 0; j < SABER_KEYBYTES; j++) {
         for (i = 0; i < 8; i++) {
-            bytes[j] = bytes[j] | ((data[j * 8 + i] & 0x01) << i);
+            bytes[j] = bytes[j] | ((data->coeffs[j * 8 + i] & 0x01) << i); // only bit 0 of coefficient 8*j + i is used; it becomes bit i of byte j
         }
     }
 }
diff --git a/crypto_kem/saber/clean/pack_unpack.h b/crypto_kem/saber/clean/pack_unpack.h
index 52537c07..fc6a3abf 100644
--- a/crypto_kem/saber/clean/pack_unpack.h
+++ b/crypto_kem/saber/clean/pack_unpack.h
@@ -1,27 +1,28 @@
 #ifndef PACK_UNPACK_H
 #define PACK_UNPACK_H
 #include "SABER_params.h"
+#include "poly.h"
 #include <stdint.h>
 #include <stdio.h>
 
-void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const uint16_t data[SABER_N]);
+void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly *data);
 
-void PQCLEAN_SABER_CLEAN_BS2POLT(uint16_t data[SABER_N], const uint8_t bytes[SABER_SCALEBYTES_KEM]);
+void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]);
 
 
-void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_SABER_CLEAN_POLVECq2BS(uint8_t bytes[SABER_POLYVECBYTES], const poly data[SABER_L]);
 
-void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const uint16_t data[SABER_L][SABER_N]);
+void PQCLEAN_SABER_CLEAN_POLVECp2BS(uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES], const poly data[SABER_L]);
 
 
-void PQCLEAN_SABER_CLEAN_BS2POLVECq(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECBYTES]);
+void PQCLEAN_SABER_CLEAN_BS2POLVECq(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECBYTES]);
 
-void PQCLEAN_SABER_CLEAN_BS2POLVECp(uint16_t data[SABER_L][SABER_N], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
+void PQCLEAN_SABER_CLEAN_BS2POLVECp(poly data[SABER_L], const uint8_t bytes[SABER_POLYVECCOMPRESSEDBYTES]);
 
 
-void PQCLEAN_SABER_CLEAN_BS2POLmsg(uint16_t data[SABER_N], const uint8_t bytes[SABER_KEYBYTES]);
+void PQCLEAN_SABER_CLEAN_BS2POLmsg(poly *data, const uint8_t bytes[SABER_KEYBYTES]);
 
-void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const uint16_t data[SABER_N]);
+void PQCLEAN_SABER_CLEAN_POLmsg2BS(uint8_t bytes[SABER_KEYBYTES], const poly *data);
 
 
 #endif
diff --git a/crypto_kem/saber/clean/poly.c b/crypto_kem/saber/clean/poly.c
index 2c44e962..588d0c99 100644
--- a/crypto_kem/saber/clean/poly.c
+++ b/crypto_kem/saber/clean/poly.c
@@ -3,32 +3,40 @@
 #include "fips202.h"
 #include "pack_unpack.h"
 #include "poly.h"
-#include "poly_mul.h"
 #include <stddef.h>
 
-void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t A[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose) {
+void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose) {
     size_t i, j;
-    for (i = 0; i < SABER_L; i++) {
-        for (j = 0; j < SABER_L; j++) {
-            if (transpose == 1) {
-                PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[j][i], s[j]);
-            } else {
-                PQCLEAN_SABER_CLEAN_poly_mul_acc(res[i], A[i][j], s[j]);
+
+    if (transpose) { // any non-zero value selects the transposed product: c[i] = sum_j A[j][i] * s[j]
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[0][i], &s[0], 0); // accumulate = 0 overwrites c[i], so c need not be zeroed by the caller
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[j][i], &s[j], 1); // remaining terms are added onto c[i]
+            }
+        }
+    } else { // c[i] = sum_j A[i][j] * s[j]
+        for (i = 0; i < SABER_L; i++) {
+            PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][0], &s[0], 0); // accumulate = 0 overwrites c[i]
+            for (j = 1; j < SABER_L; j++) {
+                PQCLEAN_SABER_CLEAN_poly_mul(&c[i], &A[i][j], &s[j], 1); // remaining terms are added onto c[i]
             }
         }
     }
 }
 
-void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]) {
-    size_t j;
-    for (j = 0; j < SABER_L; j++) {
-        PQCLEAN_SABER_CLEAN_poly_mul_acc(res, b[j], s[j]);
+void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]) {
+    size_t i;
+
+    PQCLEAN_SABER_CLEAN_poly_mul(c, &b[0], &s[0], 0); // first term overwrites c, so c need not be zeroed by the caller
+    for (i = 1; i < SABER_L; i++) {
+        PQCLEAN_SABER_CLEAN_poly_mul(c, &b[i], &s[i], 1); // c += b[i] * s[i]
     }
 }
 
-void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
+void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]) {
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYVECBYTES];
 
     shake128(buf, sizeof(buf), seed, SABER_SEEDBYTES);
 
@@ -37,13 +45,13 @@ void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t A[SABER_L][SABER_L][SABER_N], const
     }
 }
 
-void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]) {
-    uint8_t buf[SABER_L * SABER_POLYCOINBYTES];
+void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]) {
     size_t i;
+    uint8_t buf[SABER_L * SABER_POLYCOINBYTES]; // SABER_POLYCOINBYTES bytes of SHAKE-128 output per polynomial
 
-    shake128(buf, sizeof(buf), seed, SABER_NOISE_SEEDBYTES);
+    shake128(buf, sizeof(buf), seed, SABER_NOISESEEDBYTES); // expand the seed to fill buf
 
     for (i = 0; i < SABER_L; i++) {
-        PQCLEAN_SABER_CLEAN_cbd(s[i], buf + i * SABER_POLYCOINBYTES);
+        PQCLEAN_SABER_CLEAN_cbd(s[i].coeffs, buf + i * SABER_POLYCOINBYTES); // polynomial i is sampled from its own slice of buf
     }
 }
diff --git a/crypto_kem/saber/clean/poly.h b/crypto_kem/saber/clean/poly.h
index dd882cb7..d365b489 100644
--- a/crypto_kem/saber/clean/poly.h
+++ b/crypto_kem/saber/clean/poly.h
@@ -3,13 +3,21 @@
 #include "SABER_params.h"
 #include <stdint.h>
 
-void PQCLEAN_SABER_CLEAN_MatrixVectorMul(uint16_t res[SABER_L][SABER_N], const uint16_t a[SABER_L][SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N], int16_t transpose);
+typedef union { // one polynomial; the union has a single member
+    uint16_t coeffs[SABER_N]; // SABER_N coefficients, each stored in 16 bits
+} poly;
 
-void PQCLEAN_SABER_CLEAN_InnerProd(uint16_t res[SABER_N], const uint16_t b[SABER_L][SABER_N], const uint16_t s[SABER_L][SABER_N]);
 
-void PQCLEAN_SABER_CLEAN_GenMatrix(uint16_t a[SABER_L][SABER_L][SABER_N], const uint8_t seed[SABER_SEEDBYTES]);
+void PQCLEAN_SABER_CLEAN_MatrixVectorMul(poly c[SABER_L], const poly A[SABER_L][SABER_L], const poly s[SABER_L], int16_t transpose);
 
-void PQCLEAN_SABER_CLEAN_GenSecret(uint16_t s[SABER_L][SABER_N], const uint8_t seed[SABER_NOISE_SEEDBYTES]);
+void PQCLEAN_SABER_CLEAN_InnerProd(poly *c, const poly b[SABER_L], const poly s[SABER_L]);
+
+void PQCLEAN_SABER_CLEAN_GenMatrix(poly A[SABER_L][SABER_L], const uint8_t seed[SABER_SEEDBYTES]);
+
+void PQCLEAN_SABER_CLEAN_GenSecret(poly s[SABER_L], const uint8_t seed[SABER_NOISESEEDBYTES]);
+
+
+void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, int accumulate);
 
 
 #endif
diff --git a/crypto_kem/saber/clean/poly_mul.c b/crypto_kem/saber/clean/poly_mul.c
index 686960dc..0e03ff99 100644
--- a/crypto_kem/saber/clean/poly_mul.c
+++ b/crypto_kem/saber/clean/poly_mul.c
@@ -1,4 +1,4 @@
-#include "poly_mul.h"
+#include "poly.h"
 #include <stdint.h>
 #include <string.h>
 
@@ -229,14 +229,20 @@ static void toom_cook_4way (uint16_t *result, const uint16_t *a1, const uint16_t
 }
 
 /* res += a*b */
-void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]) {
-    uint16_t c[2 * SABER_N] = {0};
+void PQCLEAN_SABER_CLEAN_poly_mul(poly *c, const poly *a, const poly *b, const int accumulate) { // c = a*b when accumulate == 0, c += a*b otherwise
+    uint16_t C[2 * SABER_N] = {0}; // double-length product buffer; c is stored to only after toom_cook_4way returns
     size_t i;
 
-    toom_cook_4way(c, a, b);
+    toom_cook_4way(C, a->coeffs, b->coeffs);
 
     /* reduction */
-    for (i = SABER_N; i < 2 * SABER_N; i++) {
-        res[i - SABER_N] += (c[i - SABER_N] - c[i]);
+    if (accumulate == 0) {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] = (C[i - SABER_N] - C[i]); // fold the upper half onto the lower half with a minus sign (X^SABER_N = -1)
+        }
+    } else {
+        for (i = SABER_N; i < 2 * SABER_N; i++) {
+            c->coeffs[i - SABER_N] += (C[i - SABER_N] - C[i]); // same folding, added to the existing value of c
+        }
     }
 }
diff --git a/crypto_kem/saber/clean/poly_mul.h b/crypto_kem/saber/clean/poly_mul.h
index 82140f5b..b28b04f6 100644
--- a/crypto_kem/saber/clean/poly_mul.h
+++ b/crypto_kem/saber/clean/poly_mul.h
@@ -1,9 +1,3 @@
-#ifndef POLY_MUL_H
-#define POLY_MUL_H
-#include "SABER_params.h"
-#include <stdint.h>
-
-void PQCLEAN_SABER_CLEAN_poly_mul_acc(uint16_t res[SABER_N], const uint16_t a[SABER_N], const uint16_t b[SABER_N]);
 
 
-#endif
+
diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml
index 1790559f..0ff04768 100644
--- a/test/duplicate_consistency/firesaber_avx2.yml
+++ b/test/duplicate_consistency/firesaber_avx2.yml
@@ -3,5 +3,14 @@ consistency_checks:
         scheme: firesaber
         implementation: clean
       files:
+      - api.h
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml
index 3e93674e..5537ea62 100644
--- a/test/duplicate_consistency/firesaber_clean.yml
+++ b/test/duplicate_consistency/firesaber_clean.yml
@@ -3,5 +3,14 @@ consistency_checks:
         scheme: firesaber
         implementation: avx2
       files:
+      - api.h
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml
index 9239f8f0..ed9ea4b8 100644
--- a/test/duplicate_consistency/lightsaber_avx2.yml
+++ b/test/duplicate_consistency/lightsaber_avx2.yml
@@ -3,13 +3,27 @@ consistency_checks:
         scheme: lightsaber
         implementation: clean
       files:
+      - api.h
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
     - source:
         scheme: saber
         implementation: clean
       files:
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c
     - source:
         scheme: saber
@@ -22,13 +36,20 @@ consistency_checks:
       - SABER_indcpa.h
       - verify.h
       - kem.c
-      - pack_unpack.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
       - verify.c
     - source:
         scheme: firesaber
         implementation: clean
       files:
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c
     - source:
         scheme: firesaber
@@ -41,5 +62,7 @@ consistency_checks:
       - SABER_indcpa.h
       - verify.h
       - kem.c
-      - pack_unpack.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
       - verify.c
diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml
index 14c8975d..8146b7d0 100644
--- a/test/duplicate_consistency/lightsaber_clean.yml
+++ b/test/duplicate_consistency/lightsaber_clean.yml
@@ -3,7 +3,16 @@ consistency_checks:
         scheme: lightsaber
         implementation: avx2
       files:
+      - api.h
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
     - source:
         scheme: saber
@@ -24,7 +33,12 @@ consistency_checks:
         scheme: saber
         implementation: avx2
       files:
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c
     - source:
         scheme: firesaber
@@ -45,5 +59,10 @@ consistency_checks:
         scheme: firesaber
         implementation: avx2
       files:
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c
diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml
index 010ac0c9..4a04951d 100644
--- a/test/duplicate_consistency/saber_avx2.yml
+++ b/test/duplicate_consistency/saber_avx2.yml
@@ -3,13 +3,27 @@ consistency_checks:
         scheme: saber
         implementation: clean
       files:
+      - api.h
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
     - source:
         scheme: firesaber
         implementation: clean
       files:
+      - cbd.h
+      - pack_unpack.h
+      - kem.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c
     - source:
         scheme: firesaber
@@ -22,5 +36,7 @@ consistency_checks:
       - SABER_indcpa.h
       - verify.h
       - kem.c
-      - pack_unpack.c
+      - poly.c
+      - poly_mul.c
+      - SABER_indcpa.c
       - verify.c
diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml
index 7f01d619..a2700ea8 100644
--- a/test/duplicate_consistency/saber_clean.yml
+++ b/test/duplicate_consistency/saber_clean.yml
@@ -3,7 +3,16 @@ consistency_checks:
         scheme: saber
         implementation: avx2
       files:
+      - api.h
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
+      - SABER_params.h
       - verify.h
+      - cbd.c
+      - kem.c
+      - pack_unpack.c
       - verify.c
     - source:
         scheme: firesaber
@@ -24,5 +33,10 @@ consistency_checks:
         scheme: firesaber
         implementation: avx2
       files:
+      - cbd.h
+      - poly_mul.h
+      - pack_unpack.h
+      - SABER_indcpa.h
       - verify.h
+      - kem.c
       - verify.c

From bb037b918b4fe80149baa1a74a21fd00aaa95943 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 28 Oct 2020 12:12:44 -0400
Subject: [PATCH 06/10] Update packaging script version

---
 crypto_kem/firesaber/META.yml  | 4 ++--
 crypto_kem/lightsaber/META.yml | 4 ++--
 crypto_kem/saber/META.yml      | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index 0aa614ca..6cd4342b 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index 027f1fab..d3d7bf13 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 7eb15ca2..32b4b964 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/88ee652a/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:

From 8af8939e2b32452ffd71d59077208d7e00e7c368 Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 28 Oct 2020 12:15:04 -0400
Subject: [PATCH 07/10] Remove empty file

---
 crypto_kem/firesaber/avx2/Makefile  | 2 +-
 crypto_kem/firesaber/avx2/kem.h     | 3 ---
 crypto_kem/lightsaber/avx2/Makefile | 2 +-
 crypto_kem/lightsaber/avx2/kem.h    | 3 ---
 crypto_kem/saber/avx2/Makefile      | 2 +-
 crypto_kem/saber/avx2/kem.h         | 3 ---
 6 files changed, 3 insertions(+), 12 deletions(-)
 delete mode 100644 crypto_kem/firesaber/avx2/kem.h
 delete mode 100644 crypto_kem/lightsaber/avx2/kem.h
 delete mode 100644 crypto_kem/saber/avx2/kem.h

diff --git a/crypto_kem/firesaber/avx2/Makefile b/crypto_kem/firesaber/avx2/Makefile
index b7fbd7d8..1ecd3c1a 100644
--- a/crypto_kem/firesaber/avx2/Makefile
+++ b/crypto_kem/firesaber/avx2/Makefile
@@ -1,7 +1,7 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=libfiresaber_avx2.a
-HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
diff --git a/crypto_kem/firesaber/avx2/kem.h b/crypto_kem/firesaber/avx2/kem.h
deleted file mode 100644
index b28b04f6..00000000
--- a/crypto_kem/firesaber/avx2/kem.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
diff --git a/crypto_kem/lightsaber/avx2/Makefile b/crypto_kem/lightsaber/avx2/Makefile
index f2817574..ff4f4367 100644
--- a/crypto_kem/lightsaber/avx2/Makefile
+++ b/crypto_kem/lightsaber/avx2/Makefile
@@ -1,7 +1,7 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=liblightsaber_avx2.a
-HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
diff --git a/crypto_kem/lightsaber/avx2/kem.h b/crypto_kem/lightsaber/avx2/kem.h
deleted file mode 100644
index b28b04f6..00000000
--- a/crypto_kem/lightsaber/avx2/kem.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-
diff --git a/crypto_kem/saber/avx2/Makefile b/crypto_kem/saber/avx2/Makefile
index 070665b4..41ea6101 100644
--- a/crypto_kem/saber/avx2/Makefile
+++ b/crypto_kem/saber/avx2/Makefile
@@ -1,7 +1,7 @@
 # This Makefile can be used with GNU Make or BSD Make
 
 LIB=libsaber_avx2.a
-HEADERS=api.h cbd.h kem.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
+HEADERS=api.h cbd.h pack_unpack.h poly.h SABER_indcpa.h SABER_params.h verify.h 
 OBJECTS=cbd.o kem.o pack_unpack.o poly.o poly_mul.o SABER_indcpa.o verify.o 
 
 CFLAGS=-O3 -mavx2 -Wall -Wextra -Wpedantic -Wvla -Werror -Wredundant-decls -Wmissing-prototypes -std=c99 -I../../../common $(EXTRAFLAGS)
diff --git a/crypto_kem/saber/avx2/kem.h b/crypto_kem/saber/avx2/kem.h
deleted file mode 100644
index b28b04f6..00000000
--- a/crypto_kem/saber/avx2/kem.h
+++ /dev/null
@@ -1,3 +0,0 @@
-
-
-

From f8503cbd7158049fce79cc26fe784f17179226ea Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 28 Oct 2020 23:31:01 -0400
Subject: [PATCH 08/10] simplify pack_unpack.c

---
 crypto_kem/firesaber/META.yml                 |  4 +-
 crypto_kem/firesaber/avx2/pack_unpack.c       | 76 +++++++++--------
 crypto_kem/firesaber/clean/pack_unpack.c      | 76 +++++++++--------
 crypto_kem/lightsaber/META.yml                |  4 +-
 crypto_kem/lightsaber/avx2/pack_unpack.c      | 84 ++++++++++---------
 crypto_kem/lightsaber/clean/pack_unpack.c     | 84 ++++++++++---------
 crypto_kem/saber/META.yml                     |  4 +-
 crypto_kem/saber/avx2/pack_unpack.c           | 68 +++++++--------
 crypto_kem/saber/clean/pack_unpack.c          | 68 +++++++--------
 test/duplicate_consistency/firesaber_avx2.yml |  1 -
 .../duplicate_consistency/firesaber_clean.yml |  1 -
 .../duplicate_consistency/lightsaber_avx2.yml |  5 --
 .../lightsaber_clean.yml                      |  3 -
 test/duplicate_consistency/saber_avx2.yml     |  3 -
 test/duplicate_consistency/saber_clean.yml    |  2 -
 15 files changed, 240 insertions(+), 243 deletions(-)

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index 6cd4342b..3d1a0a41 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c
index 41b9747a..82f5a3f0 100644
--- a/crypto_kem/firesaber/avx2/pack_unpack.c
+++ b/crypto_kem/firesaber/avx2/pack_unpack.c
@@ -8,23 +8,24 @@ void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const p
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6);
-        out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2);
+        out[0] = (in[0] & 0x3f) | (in[1] << 6);
+        out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4);
+        out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2);
         in += 4;
         out += 3;
     }
 }
 
 void PQCLEAN_FIRESABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0] & 0x3f;
-        out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2);
-        out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4);
-        out[3] = ((in[2] & 0xff) >> 2);
+        out[0] = in[0];
+        out[1] = (in[0] >> 6) | (in[1] << 2);
+        out[2] = (in[1] >> 4) | (in[2] << 4);
+        out[3] = (in[2] >> 2);
         in += 3;
         out += 4;
     }
@@ -35,37 +36,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -76,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -91,10 +93,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c
index ec2f1263..91ffd723 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.c
+++ b/crypto_kem/firesaber/clean/pack_unpack.c
@@ -8,23 +8,24 @@ void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & 0x3f) | ((in[1] & 0x03) << 6);
-        out[1] = ((in[1] >> 2) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[2] = ((in[2] >> 4) & 0x03) | ((in[3] & 0x3f) << 2);
+        out[0] = (in[0] & 0x3f) | (in[1] << 6);
+        out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4);
+        out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2);
         in += 4;
         out += 3;
     }
 }
 
 void PQCLEAN_FIRESABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0] & 0x3f;
-        out[1] = ((in[0] >> 6) & 0x03) | ((in[1] & 0x0f) << 2);
-        out[2] = ((in[1] & 0xff) >> 4) | ((in[2] & 0x03) << 4);
-        out[3] = ((in[2] & 0xff) >> 2);
+        out[0] = in[0];
+        out[1] = (in[0] >> 6) | (in[1] << 2);
+        out[2] = (in[1] >> 4) | (in[2] << 4);
+        out[3] = (in[2] >> 2);
         in += 3;
         out += 4;
     }
@@ -35,37 +36,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -76,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -91,10 +93,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index d3d7bf13..263db2e0 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c
index a9f866ae..a154d24c 100644
--- a/crypto_kem/lightsaber/avx2/pack_unpack.c
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.c
@@ -8,27 +8,28 @@ void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6);
-        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7);
-        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5);
+        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6);
+        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7);
+        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5);
         in += 8;
         out += 3;
     }
 }
 
 void PQCLEAN_LIGHTSABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0]) & 0x07;
-        out[1] = ((in[0]) >> 3) & 0x07;
-        out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2);
-        out[3] = ((in[1]) >> 1) & 0x07;
-        out[4] = ((in[1]) >> 4) & 0x07;
-        out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1);
-        out[6] = ((in[2] >> 2) & 0x07);
-        out[7] = ((in[2] >> 5) & 0x07);
+        out[0] = in[0];
+        out[1] = in[0] >> 3;
+        out[2] = (in[0] >> 6) | (in[1] << 2);
+        out[3] = in[1] >> 1;
+        out[4] = in[1] >> 4;
+        out[5] = (in[1] >> 7) | (in[2] << 1);
+        out[6] = in[2] >> 2;
+        out[7] = in[2] >> 5;
         in += 3;
         out += 8;
     }
@@ -39,37 +40,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -80,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -95,10 +97,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c
index f64c4143..c1c8666c 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.c
+++ b/crypto_kem/lightsaber/clean/pack_unpack.c
@@ -8,27 +8,28 @@ void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | ((in[2] & 0x3) << 6);
-        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (((in[5]) & 0x01) << 7);
-        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | ((in[7] & 0x7) << 5);
+        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6);
+        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7);
+        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5);
         in += 8;
         out += 3;
     }
 }
 
 void PQCLEAN_LIGHTSABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0]) & 0x07;
-        out[1] = ((in[0]) >> 3) & 0x07;
-        out[2] = (((in[0]) >> 6) & 0x03) | (((in[1]) & 0x01) << 2);
-        out[3] = ((in[1]) >> 1) & 0x07;
-        out[4] = ((in[1]) >> 4) & 0x07;
-        out[5] = (((in[1]) >> 7) & 0x01) | (((in[2]) & 0x03) << 1);
-        out[6] = ((in[2] >> 2) & 0x07);
-        out[7] = ((in[2] >> 5) & 0x07);
+        out[0] = in[0];
+        out[1] = in[0] >> 3;
+        out[2] = (in[0] >> 6) | (in[1] << 2);
+        out[3] = in[1] >> 1;
+        out[4] = in[1] >> 4;
+        out[5] = (in[1] >> 7) | (in[2] << 1);
+        out[6] = in[2] >> 2;
+        out[7] = in[2] >> 5;
         in += 3;
         out += 8;
     }
@@ -39,37 +40,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -80,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -95,10 +97,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 32b4b964..319f4ebf 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/33e5ed62/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c
index 9bb46acb..a055b7e5 100644
--- a/crypto_kem/saber/avx2/pack_unpack.c
+++ b/crypto_kem/saber/avx2/pack_unpack.c
@@ -8,19 +8,20 @@ void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4);
+        out[0] = (in[0] & 0x0f) | (in[1] << 4);
         in += 2;
         out += 1;
     }
 }
 
 void PQCLEAN_SABER_AVX2_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = in[0] & 0x0f;
-        out[1] = (in[0] >> 4) & 0x0f;
+        out[0] = in[0];
+        out[1] = in[0] >> 4;
         in += 1;
         out += 2;
     }
@@ -31,37 +32,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -72,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -87,10 +89,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c
index 2542f0b0..1b5bed81 100644
--- a/crypto_kem/saber/clean/pack_unpack.c
+++ b/crypto_kem/saber/clean/pack_unpack.c
@@ -8,19 +8,20 @@ void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = (in[0] & 0x0f) | ((in[1] & 0x0f) << 4);
+        out[0] = (in[0] & 0x0f) | (in[1] << 4);
         in += 2;
         out += 1;
     }
 }
 
 void PQCLEAN_SABER_CLEAN_BS2POLT(poly *data, const uint8_t bytes[SABER_SCALEBYTES_KEM]) {
+    /* This function does not reduce its output mod T */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = in[0] & 0x0f;
-        out[1] = (in[0] >> 4) & 0x0f;
+        out[0] = in[0];
+        out[1] = in[0] >> 4;
         in += 1;
         out += 2;
     }
@@ -31,37 +32,38 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x1f) | ((in[1] & 0x07) << 5);
-        out[2] = ((in[1] >> 3) & 0xff);
-        out[3] = ((in[1] >> 11) & 0x03) | ((in[2] & 0x3f) << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | ((in[3] & 0x01) << 7);
-        out[5] = ((in[3] >> 1) & 0xff);
-        out[6] = ((in[3] >> 9) & 0x0f) | ((in[4] & 0x0f) << 4);
-        out[7] = ((in[4] >> 4) & 0xff);
-        out[8] = ((in[4] >> 12) & 0x01) | ((in[5] & 0x7f) << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | ((in[6] & 0x03) << 6);
-        out[10] = ((in[6] >> 2) & 0xff);
-        out[11] = ((in[6] >> 10) & 0x07) | ((in[7] & 0x1f) << 3);
-        out[12] = ((in[7] >> 5) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
+        out[2] = in[1] >> 3;
+        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
+        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
+        out[5] = in[3] >> 1;
+        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
+        out[7] = in[4] >> 4;
+        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
+        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
+        out[10] = in[6] >> 2;
+        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
+        out[12] = in[7] >> 5;
         in += 8;
         out += 13;
     }
 }
 
 static void BS2POLq(poly *data, const uint8_t bytes[SABER_POLYBYTES]) {
+    /* This function does not reduce its output mod Q */
     size_t j;
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x1f) << 8);
-        out[1] = (in[1] >> 5 & (0x07)) | ((in[2] & 0xff) << 3) | ((in[3] & 0x03) << 11);
-        out[2] = (in[3] >> 2 & (0x3f)) | ((in[4] & 0x7f) << 6);
-        out[3] = (in[4] >> 7 & (0x01)) | ((in[5] & 0xff) << 1) | ((in[6] & 0x0f) << 9);
-        out[4] = (in[6] >> 4 & (0x0f)) | ((in[7] & 0xff) << 4) | ((in[8] & 0x01) << 12);
-        out[5] = (in[8] >> 1 & (0x7f)) | ((in[9] & 0x3f) << 7);
-        out[6] = (in[9] >> 6 & (0x03)) | ((in[10] & 0xff) << 2) | ((in[11] & 0x07) << 10);
-        out[7] = (in[11] >> 3 & (0x1f)) | ((in[12] & 0xff) << 5);
+        out[0] = (in[0]) | (in[1] << 8);
+        out[1] = (in[1] >> 5) | (in[2] << 3) | (in[3] << 11);
+        out[2] = (in[3] >> 2) | (in[4] << 6);
+        out[3] = (in[4] >> 7) | (in[5] << 1) | (in[6] << 9);
+        out[4] = (in[6] >> 4) | (in[7] << 4) | (in[8] << 12);
+        out[5] = (in[8] >> 1) | (in[9] << 7);
+        out[6] = (in[9] >> 6) | (in[10] << 2) | (in[11] << 10);
+        out[7] = (in[11] >> 3) | (in[12] << 5);
         in += 13;
         out += 8;
     }
@@ -72,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff));
-        out[1] = ((in[0] >> 8) & 0x03) | ((in[1] & 0x3f) << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | ((in[2] & 0x0f) << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | ((in[3] & 0x03) << 6);
-        out[4] = ((in[3] >> 2) & 0xff);
+        out[0] = in[0];
+        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
+        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
+        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
+        out[4] = in[3] >> 2;
         in += 4;
         out += 5;
     }
@@ -87,10 +89,10 @@ static void BS2POLp(poly *data, const uint8_t bytes[SABER_POLYCOMPRESSEDBYTES])
     const uint8_t *in = bytes;
     uint16_t *out = data->coeffs;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & (0xff)) | ((in[1] & 0x03) << 8);
-        out[1] = ((in[1] >> 2) & (0x3f)) | ((in[2] & 0x0f) << 6);
-        out[2] = ((in[2] >> 4) & (0x0f)) | ((in[3] & 0x3f) << 4);
-        out[3] = ((in[3] >> 6) & (0x03)) | ((in[4] & 0xff) << 2);
+        out[0] = in[0] | (in[1] << 8);
+        out[1] = (in[1] >> 2) | (in[2] << 6);
+        out[2] = (in[2] >> 4) | (in[3] << 4);
+        out[3] = (in[3] >> 6) | (in[4] << 2);
         in += 5;
         out += 4;
     }
diff --git a/test/duplicate_consistency/firesaber_avx2.yml b/test/duplicate_consistency/firesaber_avx2.yml
index 0ff04768..f5240334 100644
--- a/test/duplicate_consistency/firesaber_avx2.yml
+++ b/test/duplicate_consistency/firesaber_avx2.yml
@@ -6,7 +6,6 @@ consistency_checks:
       - api.h
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - SABER_params.h
       - verify.h
diff --git a/test/duplicate_consistency/firesaber_clean.yml b/test/duplicate_consistency/firesaber_clean.yml
index 5537ea62..bcfed7c0 100644
--- a/test/duplicate_consistency/firesaber_clean.yml
+++ b/test/duplicate_consistency/firesaber_clean.yml
@@ -5,7 +5,6 @@ consistency_checks:
       files:
       - api.h
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - SABER_params.h
diff --git a/test/duplicate_consistency/lightsaber_avx2.yml b/test/duplicate_consistency/lightsaber_avx2.yml
index ed9ea4b8..dd6e9fed 100644
--- a/test/duplicate_consistency/lightsaber_avx2.yml
+++ b/test/duplicate_consistency/lightsaber_avx2.yml
@@ -6,7 +6,6 @@ consistency_checks:
       - api.h
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - SABER_params.h
       - verify.h
@@ -20,7 +19,6 @@ consistency_checks:
       files:
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - verify.h
       - kem.c
@@ -30,7 +28,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - kem.h
       - pack_unpack.h
       - poly.h
       - SABER_indcpa.h
@@ -46,7 +43,6 @@ consistency_checks:
       files:
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - verify.h
       - kem.c
@@ -56,7 +52,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - kem.h
       - pack_unpack.h
       - poly.h
       - SABER_indcpa.h
diff --git a/test/duplicate_consistency/lightsaber_clean.yml b/test/duplicate_consistency/lightsaber_clean.yml
index 8146b7d0..2f36ec86 100644
--- a/test/duplicate_consistency/lightsaber_clean.yml
+++ b/test/duplicate_consistency/lightsaber_clean.yml
@@ -5,7 +5,6 @@ consistency_checks:
       files:
       - api.h
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - SABER_params.h
@@ -34,7 +33,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - verify.h
@@ -60,7 +58,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - verify.h
diff --git a/test/duplicate_consistency/saber_avx2.yml b/test/duplicate_consistency/saber_avx2.yml
index 4a04951d..0b4b60d7 100644
--- a/test/duplicate_consistency/saber_avx2.yml
+++ b/test/duplicate_consistency/saber_avx2.yml
@@ -6,7 +6,6 @@ consistency_checks:
       - api.h
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - SABER_params.h
       - verify.h
@@ -20,7 +19,6 @@ consistency_checks:
       files:
       - cbd.h
       - pack_unpack.h
-      - kem.h
       - SABER_indcpa.h
       - verify.h
       - kem.c
@@ -30,7 +28,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - kem.h
       - pack_unpack.h
       - poly.h
       - SABER_indcpa.h
diff --git a/test/duplicate_consistency/saber_clean.yml b/test/duplicate_consistency/saber_clean.yml
index a2700ea8..7f5ba121 100644
--- a/test/duplicate_consistency/saber_clean.yml
+++ b/test/duplicate_consistency/saber_clean.yml
@@ -5,7 +5,6 @@ consistency_checks:
       files:
       - api.h
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - SABER_params.h
@@ -34,7 +33,6 @@ consistency_checks:
         implementation: avx2
       files:
       - cbd.h
-      - poly_mul.h
       - pack_unpack.h
       - SABER_indcpa.h
       - verify.h

From 11b4772e73ae4967619356b517c715b21f27627b Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Wed, 28 Oct 2020 23:42:28 -0400
Subject: [PATCH 09/10] explicit casts

---
 crypto_kem/firesaber/META.yml             |  4 +--
 crypto_kem/firesaber/avx2/pack_unpack.c   | 42 +++++++++++------------
 crypto_kem/firesaber/clean/pack_unpack.c  | 42 +++++++++++------------
 crypto_kem/lightsaber/META.yml            |  4 +--
 crypto_kem/lightsaber/avx2/pack_unpack.c  | 42 +++++++++++------------
 crypto_kem/lightsaber/clean/pack_unpack.c | 42 +++++++++++------------
 crypto_kem/saber/META.yml                 |  4 +--
 crypto_kem/saber/avx2/pack_unpack.c       | 38 ++++++++++----------
 crypto_kem/saber/clean/pack_unpack.c      | 38 ++++++++++----------
 9 files changed, 128 insertions(+), 128 deletions(-)

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index 3d1a0a41..24363a85 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/firesaber/avx2/pack_unpack.c b/crypto_kem/firesaber/avx2/pack_unpack.c
index 82f5a3f0..d5e6b9ba 100644
--- a/crypto_kem/firesaber/avx2/pack_unpack.c
+++ b/crypto_kem/firesaber/avx2/pack_unpack.c
@@ -8,9 +8,9 @@ void PQCLEAN_FIRESABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const p
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & 0x3f) | (in[1] << 6);
-        out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4);
-        out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2);
+        out[0] = (uint8_t) ((in[0] & 0x3f) | (in[1] << 6));
+        out[1] = (uint8_t) (((in[1] >> 2) & 0x0f) | (in[2] << 4));
+        out[2] = (uint8_t) (((in[2] >> 4) & 0x03) | (in[3] << 2));
         in += 4;
         out += 3;
     }
@@ -36,19 +36,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -78,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }
diff --git a/crypto_kem/firesaber/clean/pack_unpack.c b/crypto_kem/firesaber/clean/pack_unpack.c
index 91ffd723..2d1538ae 100644
--- a/crypto_kem/firesaber/clean/pack_unpack.c
+++ b/crypto_kem/firesaber/clean/pack_unpack.c
@@ -8,9 +8,9 @@ void PQCLEAN_FIRESABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = (in[0] & 0x3f) | (in[1] << 6);
-        out[1] = ((in[1] >> 2) & 0x0f) | (in[2] << 4);
-        out[2] = ((in[2] >> 4) & 0x03) | (in[3] << 2);
+        out[0] = (uint8_t) ((in[0] & 0x3f) | (in[1] << 6));
+        out[1] = (uint8_t) (((in[1] >> 2) & 0x0f) | (in[2] << 4));
+        out[2] = (uint8_t) (((in[2] >> 4) & 0x03) | (in[3] << 2));
         in += 4;
         out += 3;
     }
@@ -36,19 +36,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -78,11 +78,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index 263db2e0..ad9d6acc 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/avx2/pack_unpack.c b/crypto_kem/lightsaber/avx2/pack_unpack.c
index a154d24c..08f7a9d9 100644
--- a/crypto_kem/lightsaber/avx2/pack_unpack.c
+++ b/crypto_kem/lightsaber/avx2/pack_unpack.c
@@ -8,9 +8,9 @@ void PQCLEAN_LIGHTSABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6);
-        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7);
-        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5);
+        out[0] = (uint8_t) ((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6));
+        out[1] = (uint8_t) (((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7));
+        out[2] = (uint8_t) (((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5));
         in += 8;
         out += 3;
     }
@@ -40,19 +40,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -82,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }
diff --git a/crypto_kem/lightsaber/clean/pack_unpack.c b/crypto_kem/lightsaber/clean/pack_unpack.c
index c1c8666c..106a62d4 100644
--- a/crypto_kem/lightsaber/clean/pack_unpack.c
+++ b/crypto_kem/lightsaber/clean/pack_unpack.c
@@ -8,9 +8,9 @@ void PQCLEAN_LIGHTSABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = (in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6);
-        out[1] = ((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7);
-        out[2] = ((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5);
+        out[0] = (uint8_t) ((in[0] & 0x7) | ((in[1] & 0x7) << 3) | (in[2] << 6));
+        out[1] = (uint8_t) (((in[2] >> 2) & 0x01) | ((in[3] & 0x7) << 1) | ((in[4] & 0x7) << 4) | (in[5] << 7));
+        out[2] = (uint8_t) (((in[5] >> 1) & 0x03) | ((in[6] & 0x7) << 2) | (in[7] << 5));
         in += 8;
         out += 3;
     }
@@ -40,19 +40,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -82,11 +82,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index 319f4ebf..f6375c71 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/90d072e4/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/avx2/pack_unpack.c b/crypto_kem/saber/avx2/pack_unpack.c
index a055b7e5..f9315d5d 100644
--- a/crypto_kem/saber/avx2/pack_unpack.c
+++ b/crypto_kem/saber/avx2/pack_unpack.c
@@ -8,7 +8,7 @@ void PQCLEAN_SABER_AVX2_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = (in[0] & 0x0f) | (in[1] << 4);
+        out[0] = (uint8_t) ((in[0] & 0x0f) | (in[1] << 4));
         in += 2;
         out += 1;
     }
@@ -32,19 +32,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -74,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }
diff --git a/crypto_kem/saber/clean/pack_unpack.c b/crypto_kem/saber/clean/pack_unpack.c
index 1b5bed81..89a98951 100644
--- a/crypto_kem/saber/clean/pack_unpack.c
+++ b/crypto_kem/saber/clean/pack_unpack.c
@@ -8,7 +8,7 @@ void PQCLEAN_SABER_CLEAN_POLT2BS(uint8_t bytes[SABER_SCALEBYTES_KEM], const poly
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 2; j++) {
-        out[0] = (in[0] & 0x0f) | (in[1] << 4);
+        out[0] = (uint8_t) ((in[0] & 0x0f) | (in[1] << 4));
         in += 2;
         out += 1;
     }
@@ -32,19 +32,19 @@ static void POLq2BS(uint8_t bytes[SABER_POLYBYTES], const poly *data) {
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 8; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x1f) | (in[1] << 5);
-        out[2] = in[1] >> 3;
-        out[3] = ((in[1] >> 11) & 0x03) | (in[2] << 2);
-        out[4] = ((in[2] >> 6) & 0x7f) | (in[3] << 7);
-        out[5] = in[3] >> 1;
-        out[6] = ((in[3] >> 9) & 0x0f) | (in[4] << 4);
-        out[7] = in[4] >> 4;
-        out[8] = ((in[4] >> 12) & 0x01) | (in[5] << 1);
-        out[9] = ((in[5] >> 7) & 0x3f) | (in[6] << 6);
-        out[10] = in[6] >> 2;
-        out[11] = ((in[6] >> 10) & 0x07) | (in[7] << 3);
-        out[12] = in[7] >> 5;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x1f) | (in[1] << 5));
+        out[2] = (uint8_t) (in[1] >> 3);
+        out[3] = (uint8_t) (((in[1] >> 11) & 0x03) | (in[2] << 2));
+        out[4] = (uint8_t) (((in[2] >> 6) & 0x7f) | (in[3] << 7));
+        out[5] = (uint8_t) (in[3] >> 1);
+        out[6] = (uint8_t) (((in[3] >> 9) & 0x0f) | (in[4] << 4));
+        out[7] = (uint8_t) (in[4] >> 4);
+        out[8] = (uint8_t) (((in[4] >> 12) & 0x01) | (in[5] << 1));
+        out[9] = (uint8_t) (((in[5] >> 7) & 0x3f) | (in[6] << 6));
+        out[10] = (uint8_t) (in[6] >> 2);
+        out[11] = (uint8_t) (((in[6] >> 10) & 0x07) | (in[7] << 3));
+        out[12] = (uint8_t) (in[7] >> 5);
         in += 8;
         out += 13;
     }
@@ -74,11 +74,11 @@ static void POLp2BS(uint8_t bytes[SABER_POLYCOMPRESSEDBYTES], const poly *data)
     const uint16_t *in = data->coeffs;
     uint8_t *out = bytes;
     for (j = 0; j < SABER_N / 4; j++) {
-        out[0] = in[0];
-        out[1] = ((in[0] >> 8) & 0x03) | (in[1] << 2);
-        out[2] = ((in[1] >> 6) & 0x0f) | (in[2] << 4);
-        out[3] = ((in[2] >> 4) & 0x3f) | (in[3] << 6);
-        out[4] = in[3] >> 2;
+        out[0] = (uint8_t) (in[0]);
+        out[1] = (uint8_t) (((in[0] >> 8) & 0x03) | (in[1] << 2));
+        out[2] = (uint8_t) (((in[1] >> 6) & 0x0f) | (in[2] << 4));
+        out[3] = (uint8_t) (((in[2] >> 4) & 0x3f) | (in[3] << 6));
+        out[4] = (uint8_t) (in[3] >> 2);
         in += 4;
         out += 5;
     }

From dd00b7fbd89d373286ea8e474ffdbb171580da8d Mon Sep 17 00:00:00 2001
From: "John M. Schanck" <jschanck@uwaterloo.ca>
Date: Fri, 30 Oct 2020 18:01:44 -0400
Subject: [PATCH 10/10] slightly faster avx2 schoolbook multiplications

---
 crypto_kem/firesaber/META.yml         |    4 +-
 crypto_kem/firesaber/avx2/poly_mul.c  | 1336 ++++++++++++-------------
 crypto_kem/lightsaber/META.yml        |    4 +-
 crypto_kem/lightsaber/avx2/poly_mul.c | 1336 ++++++++++++-------------
 crypto_kem/saber/META.yml             |    4 +-
 crypto_kem/saber/avx2/poly_mul.c      | 1336 ++++++++++++-------------
 6 files changed, 1968 insertions(+), 2052 deletions(-)

diff --git a/crypto_kem/firesaber/META.yml b/crypto_kem/firesaber/META.yml
index 24363a85..9e067250 100644
--- a/crypto_kem/firesaber/META.yml
+++ b/crypto_kem/firesaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/firesaber/avx2/poly_mul.c b/crypto_kem/firesaber/avx2/poly_mul.c
index d4e37d59..4d4ec959 100644
--- a/crypto_kem/firesaber/avx2/poly_mul.c
+++ b/crypto_kem/firesaber/avx2/poly_mul.c
@@ -4,701 +4,673 @@
 
 #define L (SABER_N / 64)
 
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
+/* Multiply 16 packed 16-bit words lane-wise, keeping the low 16 bits of each product */
+#define mul(a, b)     _mm256_mullo_epi16((a), (b))
+/* Lane-wise 16-bit multiply of a and b (low 16 bits kept), then add c with 16-bit wraparound */
+#define mac(a, b, c)  _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c))
 
+static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i t0;
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = mul_add(a0, b0, c[0]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[1] = _mm256_add_epi16(temp, c[1]);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    temp = mul_add(a2, b0, temp);
-    c[2] = _mm256_add_epi16(temp, c[2]);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    temp = mul_add(a3, b0, temp);
-    c[3] = _mm256_add_epi16(temp, c[3]);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    c[4] = _mm256_add_epi16(temp, c[4]);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    temp = mul_add(a5, b0, temp);
-    c[5] = _mm256_add_epi16(temp, c[5]);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a4, b2, temp);
-    c[6] = _mm256_add_epi16(temp, c[6]);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a5, b2, temp);
-    c[7] = _mm256_add_epi16(temp, c[7]);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a6, b2, temp);
-    c[8] = _mm256_add_epi16(temp, c[8]);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a7, b2, temp);
-    c[9] = _mm256_add_epi16(temp, c[9]);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a[8], b2, temp);
-    c[10] = _mm256_add_epi16(temp, c[10]);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    temp = mul_add(a[9], b2, temp);
-    c[11] = _mm256_add_epi16(temp, c[11]);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    temp = mul_add(a[10], b2, temp);
-    c[12] = _mm256_add_epi16(temp, c[12]);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    temp = mul_add(a[11], b2, temp);
-    c[13] = _mm256_add_epi16(temp, c[13]);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    temp = mul_add(a[12], b2, temp);
-    c[14] = _mm256_add_epi16(temp, c[14]);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    temp = mul_add(a[13], b2, temp);
-    c[15] = _mm256_add_epi16(temp, c[15]);
-
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    temp = mul_add(a1, b[1], temp);
-    c[16] = _mm256_add_epi16(temp, c[16]);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    temp = mul_add(a1, b[2], temp);
-    c[17] = _mm256_add_epi16(temp, c[17]);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    temp = mul_add(a1, b[3], temp);
-    c[18] = _mm256_add_epi16(temp, c[18]);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    temp = mul_add(a1, b[4], temp);
-    c[19] = _mm256_add_epi16(temp, c[19]);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    temp = mul_add(a1, b[5], temp);
-    c[20] = _mm256_add_epi16(temp, c[20]);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    temp = mul_add(a1, b[6], temp);
-    c[21] = _mm256_add_epi16(temp, c[21]);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    temp = mul_add(a1, b[7], temp);
-    c[22] = _mm256_add_epi16(temp, c[22]);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    temp = mul_add(a1, b7, temp);
-    c[23] = _mm256_add_epi16(temp, c[23]);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    temp = mul_add(a1, b6, temp);
-    c[24] = _mm256_add_epi16(temp, c[24]);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    temp = mul_add(a1, b5, temp);
-    c[25] = _mm256_add_epi16(temp, c[25]);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    temp = mul_add(a1, b4, temp);
-    c[26] = _mm256_add_epi16(temp, c[26]);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    temp = mul_add(a1, b3, temp);
-    c[27] = _mm256_add_epi16(temp, c[27]);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    temp = mul_add(a1, b2, temp);
-    c[28] = _mm256_add_epi16(temp, c[28]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[29] = _mm256_add_epi16(temp, c[29]);
-
-    c[30] = mul_add(a1, b1, c[30]);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mul(a0, b0);
+    t0 = mul(a0, b1);
+    c[1] = mac(a1, b0, t0);
+    t0 = mul(a0, b2);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mul(a3, b3);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mul(a3, b3);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mul(a3, b3);
+    a0 = a[4];
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8];
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mul(a3, b3);
+    a0 = a[12];
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    c[31] = _mm256_setzero_si256();
 }
 
-
-static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
-
+static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i t0;
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = _mm256_mullo_epi16(a0, b0);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[1] = mul_add(a1, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    c[2] = mul_add(a2, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    c[3] = mul_add(a3, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    c[4] = mul_add(a2, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    c[5] = mul_add(a5, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    c[6] = mul_add(a4, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    c[7] = mul_add(a5, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    c[8] = mul_add(a6, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    c[9] = mul_add(a7, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    c[10] = mul_add(a[8], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    c[11] = mul_add(a[9], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    c[12] = mul_add(a[10], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    c[13] = mul_add(a[11], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    c[14] = mul_add(a[12], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    c[15] = mul_add(a[13], b2, temp);
-
-    // unrolled second triangle
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    c[16] = mul_add(a1, b[1], temp);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    c[17] = mul_add(a1, b[2], temp);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    c[18] = mul_add(a1, b[3], temp);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    c[19] = mul_add(a1, b[4], temp);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    c[20] = mul_add(a1, b[5], temp);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    c[21] = mul_add(a1, b[6], temp);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    c[22] = mul_add(a1, b[7], temp);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    c[23] = mul_add(a1, b7, temp);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    c[24] = mul_add(a1, b6, temp);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    c[25] = mul_add(a1, b5, temp);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    c[26] = mul_add(a1, b4, temp);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    c[27] = mul_add(a1, b3, temp);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    c[28] = mul_add(a1, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[29] = mul_add(a1, b0, temp);
-
-    c[30] = _mm256_mullo_epi16(a1, b1);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mac(a0, b0, c[0]);
+    t0 = mac(a0, b1, c[1]);
+    c[1] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[2]);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[3]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[4]);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[5]);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mac(a3, b3, c[6]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    a0 = a[4];
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8];
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    a0 = a[12];
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[27]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[28]);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[29]);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mac(a3, b3, c[30]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
 }
 
+
 static void transpose(__m256i *M) {
     __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
     __m256i temp, temp0, temp1, temp2;
@@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co
     //-----------------Forward transposes ends---------------------------------
 
     if (accumulate == 0) {
-        schoolbook_avx(vc, va, vb);
-        schoolbook_avx(vc + 32, va + 16, vb + 16);
-        schoolbook_avx(vc + 64, va + 32, vb + 32);
-        schoolbook_avx(vc + 96, va + 48, vb + 48);
+        schoolbook16x16(vc, va, vb);
+        schoolbook16x16(vc + 32, va + 16, vb + 16);
+        schoolbook16x16(vc + 64, va + 32, vb + 32);
+        schoolbook16x16(vc + 96, va + 48, vb + 48);
     } else {
-        schoolbook_avx_acc(vc, va, vb);
-        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
-        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
-        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+        schoolbook16x16_acc(vc, va, vb);
+        schoolbook16x16_acc(vc + 32, va + 16, vb + 16);
+        schoolbook16x16_acc(vc + 64, va + 32, vb + 32);
+        schoolbook16x16_acc(vc + 96, va + 48, vb + 48);
     }
 }
 
diff --git a/crypto_kem/lightsaber/META.yml b/crypto_kem/lightsaber/META.yml
index ad9d6acc..ec0f7517 100644
--- a/crypto_kem/lightsaber/META.yml
+++ b/crypto_kem/lightsaber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/lightsaber/avx2/poly_mul.c b/crypto_kem/lightsaber/avx2/poly_mul.c
index 9ae8de05..51504491 100644
--- a/crypto_kem/lightsaber/avx2/poly_mul.c
+++ b/crypto_kem/lightsaber/avx2/poly_mul.c
@@ -4,701 +4,673 @@
 
 #define L (SABER_N / 64)
 
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
+/* 16 word parallel multiply */
+#define mul(a, b)     _mm256_mullo_epi16((a), (b))
+/* 16 word parallel multiply and accumulate */
+#define mac(a, b, c)  _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c))
 
+static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i t0;
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = mul_add(a0, b0, c[0]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[1] = _mm256_add_epi16(temp, c[1]);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    temp = mul_add(a2, b0, temp);
-    c[2] = _mm256_add_epi16(temp, c[2]);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    temp = mul_add(a3, b0, temp);
-    c[3] = _mm256_add_epi16(temp, c[3]);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    c[4] = _mm256_add_epi16(temp, c[4]);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    temp = mul_add(a5, b0, temp);
-    c[5] = _mm256_add_epi16(temp, c[5]);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a4, b2, temp);
-    c[6] = _mm256_add_epi16(temp, c[6]);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a5, b2, temp);
-    c[7] = _mm256_add_epi16(temp, c[7]);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a6, b2, temp);
-    c[8] = _mm256_add_epi16(temp, c[8]);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a7, b2, temp);
-    c[9] = _mm256_add_epi16(temp, c[9]);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a[8], b2, temp);
-    c[10] = _mm256_add_epi16(temp, c[10]);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    temp = mul_add(a[9], b2, temp);
-    c[11] = _mm256_add_epi16(temp, c[11]);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    temp = mul_add(a[10], b2, temp);
-    c[12] = _mm256_add_epi16(temp, c[12]);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    temp = mul_add(a[11], b2, temp);
-    c[13] = _mm256_add_epi16(temp, c[13]);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    temp = mul_add(a[12], b2, temp);
-    c[14] = _mm256_add_epi16(temp, c[14]);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    temp = mul_add(a[13], b2, temp);
-    c[15] = _mm256_add_epi16(temp, c[15]);
-
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    temp = mul_add(a1, b[1], temp);
-    c[16] = _mm256_add_epi16(temp, c[16]);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    temp = mul_add(a1, b[2], temp);
-    c[17] = _mm256_add_epi16(temp, c[17]);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    temp = mul_add(a1, b[3], temp);
-    c[18] = _mm256_add_epi16(temp, c[18]);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    temp = mul_add(a1, b[4], temp);
-    c[19] = _mm256_add_epi16(temp, c[19]);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    temp = mul_add(a1, b[5], temp);
-    c[20] = _mm256_add_epi16(temp, c[20]);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    temp = mul_add(a1, b[6], temp);
-    c[21] = _mm256_add_epi16(temp, c[21]);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    temp = mul_add(a1, b[7], temp);
-    c[22] = _mm256_add_epi16(temp, c[22]);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    temp = mul_add(a1, b7, temp);
-    c[23] = _mm256_add_epi16(temp, c[23]);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    temp = mul_add(a1, b6, temp);
-    c[24] = _mm256_add_epi16(temp, c[24]);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    temp = mul_add(a1, b5, temp);
-    c[25] = _mm256_add_epi16(temp, c[25]);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    temp = mul_add(a1, b4, temp);
-    c[26] = _mm256_add_epi16(temp, c[26]);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    temp = mul_add(a1, b3, temp);
-    c[27] = _mm256_add_epi16(temp, c[27]);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    temp = mul_add(a1, b2, temp);
-    c[28] = _mm256_add_epi16(temp, c[28]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[29] = _mm256_add_epi16(temp, c[29]);
-
-    c[30] = mul_add(a1, b1, c[30]);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mul(a0, b0);
+    t0 = mul(a0, b1);
+    c[1] = mac(a1, b0, t0);
+    t0 = mul(a0, b2);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mul(a3, b3);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mul(a3, b3);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mul(a3, b3);
+    a0 = a[4];
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8];
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mul(a3, b3);
+    a0 = a[12];
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mul(a3, b3);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    c[31] = _mm256_setzero_si256();
 }
 
-
-static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
-
+static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) { /* Accumulating 16x16 schoolbook product: adds a[0..15] * b[0..15] into c[0..30] as sixteen 4x4 blocks; c[31] is never read or written here. mac(x, y, z) is presumably x*y + z per 16-bit lane (defined outside this hunk). */
+    __m256i a0, a1, a2, a3; /* the four a[] vectors of the block currently being multiplied */
+    __m256i b0, b1, b2, b3; /* the four b[] vectors of the block currently being multiplied */
+    __m256i t0; /* running sum for the output coefficient under construction */
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = _mm256_mullo_epi16(a0, b0);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[1] = mul_add(a1, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    c[2] = mul_add(a2, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    c[3] = mul_add(a3, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    c[4] = mul_add(a2, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    c[5] = mul_add(a5, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    c[6] = mul_add(a4, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    c[7] = mul_add(a5, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    c[8] = mul_add(a6, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    c[9] = mul_add(a7, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    c[10] = mul_add(a[8], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    c[11] = mul_add(a[9], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    c[12] = mul_add(a[10], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    c[13] = mul_add(a[11], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    c[14] = mul_add(a[12], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    c[15] = mul_add(a[13], b2, temp);
-
-    // unrolled second triangle
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    c[16] = mul_add(a1, b[1], temp);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    c[17] = mul_add(a1, b[2], temp);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    c[18] = mul_add(a1, b[3], temp);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    c[19] = mul_add(a1, b[4], temp);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    c[20] = mul_add(a1, b[5], temp);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    c[21] = mul_add(a1, b[6], temp);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    c[22] = mul_add(a1, b[7], temp);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    c[23] = mul_add(a1, b7, temp);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    c[24] = mul_add(a1, b6, temp);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    c[25] = mul_add(a1, b5, temp);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    c[26] = mul_add(a1, b4, temp);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    c[27] = mul_add(a1, b3, temp);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    c[28] = mul_add(a1, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[29] = mul_add(a1, b0, temp);
-
-    c[30] = _mm256_mullo_epi16(a1, b1);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mac(a0, b0, c[0]); /* block a[0..3] x b[0..3] -> accumulates into c[0..6] */
+    t0 = mac(a0, b1, c[1]);
+    c[1] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[2]);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[3]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[4]);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[5]);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mac(a3, b3, c[6]);
+    b0 = b[4]; /* block a[0..3] x b[4..7] -> accumulates into c[4..10] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    b0 = b[8]; /* block a[0..3] x b[8..11] -> accumulates into c[8..14] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[12]; /* block a[0..3] x b[12..15] -> accumulates into c[12..18] */
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    a0 = a[4]; /* block a[4..7] x b[12..15] -> accumulates into c[16..22]; b0..b3 are kept from the previous block */
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[8]; /* block a[4..7] x b[8..11] -> accumulates into c[12..18] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4]; /* block a[4..7] x b[4..7] -> accumulates into c[8..14] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0]; /* block a[4..7] x b[0..3] -> accumulates into c[4..10] */
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8]; /* block a[8..11] x b[0..3] -> accumulates into c[8..14]; b0..b3 are kept from the previous block */
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4]; /* block a[8..11] x b[4..7] -> accumulates into c[12..18] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8]; /* block a[8..11] x b[8..11] -> accumulates into c[16..22] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12]; /* block a[8..11] x b[12..15] -> accumulates into c[20..26] */
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    a0 = a[12]; /* block a[12..15] x b[12..15] -> accumulates into c[24..30]; b0..b3 are kept from the previous block */
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[27]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[28]);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[29]);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mac(a3, b3, c[30]);
+    b0 = b[8]; /* block a[12..15] x b[8..11] -> accumulates into c[20..26] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4]; /* block a[12..15] x b[4..7] -> accumulates into c[16..22] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0]; /* block a[12..15] x b[0..3] -> accumulates into c[12..18] */
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
 }
 
+
 static void transpose(__m256i *M) {
     __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
     __m256i temp, temp0, temp1, temp2;
@@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co
     //-----------------Forward transposes ends---------------------------------
 
     if (accumulate == 0) {
-        schoolbook_avx(vc, va, vb);
-        schoolbook_avx(vc + 32, va + 16, vb + 16);
-        schoolbook_avx(vc + 64, va + 32, vb + 32);
-        schoolbook_avx(vc + 96, va + 48, vb + 48);
+        schoolbook16x16(vc, va, vb);
+        schoolbook16x16(vc + 32, va + 16, vb + 16);
+        schoolbook16x16(vc + 64, va + 32, vb + 32);
+        schoolbook16x16(vc + 96, va + 48, vb + 48);
     } else {
-        schoolbook_avx_acc(vc, va, vb);
-        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
-        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
-        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+        schoolbook16x16_acc(vc, va, vb);
+        schoolbook16x16_acc(vc + 32, va + 16, vb + 16);
+        schoolbook16x16_acc(vc + 64, va + 32, vb + 32);
+        schoolbook16x16_acc(vc + 96, va + 48, vb + 48);
     }
 }
 
diff --git a/crypto_kem/saber/META.yml b/crypto_kem/saber/META.yml
index f6375c71..742d77c5 100644
--- a/crypto_kem/saber/META.yml
+++ b/crypto_kem/saber/META.yml
@@ -14,9 +14,9 @@ principal-submitters:
   - Frederik Vercauteren
 implementations:
     - name: clean
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
     - name: avx2
-      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/3a63008f/saber
+      version: https://github.com/KULeuven-COSIC/SABER/tree/509cc5ec3a7e12a751ccdd2ef5bd6e54e00bd350 via https://github.com/jschanck/package-pqclean/tree/1ae84c3c/saber
       supported_platforms:
           - architecture: x86_64
             operating_systems:
diff --git a/crypto_kem/saber/avx2/poly_mul.c b/crypto_kem/saber/avx2/poly_mul.c
index 5ec0aa73..2090e64f 100644
--- a/crypto_kem/saber/avx2/poly_mul.c
+++ b/crypto_kem/saber/avx2/poly_mul.c
@@ -4,701 +4,673 @@
 
 #define L (SABER_N / 64)
 
-static inline __m256i mul_add(__m256i a, __m256i b, __m256i c) {
-    return _mm256_add_epi16(_mm256_mullo_epi16(a, b), c);
-}
-
-static void schoolbook_avx_acc(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
+/* 16 word parallel multiply: lane-wise product of sixteen 16-bit words, keeping the low 16 bits of each product */
+#define mul(a, b)     _mm256_mullo_epi16((a), (b))
+/* 16 word parallel multiply and accumulate: lane-wise (a * b) + c on 16-bit words, wrapping (non-saturating) add */
+#define mac(a, b, c)  _mm256_add_epi16(_mm256_mullo_epi16((a), (b)), (c))
 
+static void schoolbook16x16(__m256i *c, const __m256i *a, const __m256i *b) { /* c[0..30] = schoolbook product of a[0..15] and b[0..15] (each coefficient is one vector of 16-bit lanes); c[0..30] are overwritten, c[31] is zeroed */
+    __m256i a0, a1, a2, a3; /* current group of four coefficients of a */
+    __m256i b0, b1, b2, b3; /* current group of four coefficients of b */
+    __m256i t0; /* running sum for the output coefficient being built */
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = mul_add(a0, b0, c[0]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[1] = _mm256_add_epi16(temp, c[1]);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    temp = mul_add(a2, b0, temp);
-    c[2] = _mm256_add_epi16(temp, c[2]);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    temp = mul_add(a3, b0, temp);
-    c[3] = _mm256_add_epi16(temp, c[3]);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    c[4] = _mm256_add_epi16(temp, c[4]);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    temp = mul_add(a5, b0, temp);
-    c[5] = _mm256_add_epi16(temp, c[5]);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a4, b2, temp);
-    c[6] = _mm256_add_epi16(temp, c[6]);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a5, b2, temp);
-    c[7] = _mm256_add_epi16(temp, c[7]);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a6, b2, temp);
-    c[8] = _mm256_add_epi16(temp, c[8]);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a7, b2, temp);
-    c[9] = _mm256_add_epi16(temp, c[9]);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a[8], b2, temp);
-    c[10] = _mm256_add_epi16(temp, c[10]);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    temp = mul_add(a[9], b2, temp);
-    c[11] = _mm256_add_epi16(temp, c[11]);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    temp = mul_add(a[10], b2, temp);
-    c[12] = _mm256_add_epi16(temp, c[12]);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    temp = mul_add(a[11], b2, temp);
-    c[13] = _mm256_add_epi16(temp, c[13]);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    temp = mul_add(a[12], b2, temp);
-    c[14] = _mm256_add_epi16(temp, c[14]);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    temp = mul_add(a[13], b2, temp);
-    c[15] = _mm256_add_epi16(temp, c[15]);
-
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    temp = mul_add(a1, b[1], temp);
-    c[16] = _mm256_add_epi16(temp, c[16]);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    temp = mul_add(a1, b[2], temp);
-    c[17] = _mm256_add_epi16(temp, c[17]);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    temp = mul_add(a1, b[3], temp);
-    c[18] = _mm256_add_epi16(temp, c[18]);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    temp = mul_add(a1, b[4], temp);
-    c[19] = _mm256_add_epi16(temp, c[19]);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    temp = mul_add(a1, b[5], temp);
-    c[20] = _mm256_add_epi16(temp, c[20]);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    temp = mul_add(a1, b[6], temp);
-    c[21] = _mm256_add_epi16(temp, c[21]);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    temp = mul_add(a1, b[7], temp);
-    c[22] = _mm256_add_epi16(temp, c[22]);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    temp = mul_add(a1, b7, temp);
-    c[23] = _mm256_add_epi16(temp, c[23]);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    temp = mul_add(a1, b6, temp);
-    c[24] = _mm256_add_epi16(temp, c[24]);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    temp = mul_add(a1, b5, temp);
-    c[25] = _mm256_add_epi16(temp, c[25]);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    temp = mul_add(a1, b4, temp);
-    c[26] = _mm256_add_epi16(temp, c[26]);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    temp = mul_add(a1, b3, temp);
-    c[27] = _mm256_add_epi16(temp, c[27]);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    temp = mul_add(a1, b2, temp);
-    c[28] = _mm256_add_epi16(temp, c[28]);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    temp = mul_add(a1, b0, temp);
-    c[29] = _mm256_add_epi16(temp, c[29]);
-
-    c[30] = mul_add(a1, b1, c[30]);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mul(a0, b0); /* block a[0..3] * b[0..3]: first writes to c[0..6] */
+    t0 = mul(a0, b1);
+    c[1] = mac(a1, b0, t0);
+    t0 = mul(a0, b2);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mul(a3, b3);
+    b0 = b[4]; /* block a[0..3] * b[4..7]: adds into c[4..6], first writes to c[7..10] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mul(a3, b3);
+    b0 = b[8]; /* block a[0..3] * b[8..11]: adds into c[8..10], first writes to c[11..14] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mul(a3, b3);
+    b0 = b[12]; /* block a[0..3] * b[12..15]: adds into c[12..14], first writes to c[15..18] */
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mul(a3, b3);
+    a0 = a[4]; /* block a[4..7] * b[12..15] (b registers unchanged): adds into c[16..18], first writes to c[19..22] */
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mul(a3, b3);
+    b0 = b[8]; /* block a[4..7] * b[8..11]: accumulates into c[12..18] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4]; /* block a[4..7] * b[4..7]: accumulates into c[8..14] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0]; /* block a[4..7] * b[0..3]: accumulates into c[4..10] */
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8]; /* block a[8..11] * b[0..3] (b registers unchanged): accumulates into c[8..14] */
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4]; /* block a[8..11] * b[4..7]: accumulates into c[12..18] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8]; /* block a[8..11] * b[8..11]: accumulates into c[16..22] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12]; /* block a[8..11] * b[12..15]: adds into c[20..22], first writes to c[23..26] */
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mul(a3, b3);
+    a0 = a[12]; /* block a[12..15] * b[12..15] (b registers unchanged): adds into c[24..26], first writes to c[27..30] */
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mul(a0, b3);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mul(a1, b3);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mul(a2, b3);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mul(a3, b3);
+    b0 = b[8]; /* block a[12..15] * b[8..11]: accumulates into c[20..26] */
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4]; /* block a[12..15] * b[4..7]: accumulates into c[16..22] */
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0]; /* block a[12..15] * b[0..3]: accumulates into c[12..18] */
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    c[31] = _mm256_setzero_si256(); /* the product only fills c[0..30]; the last slot is cleared */
 }
 
-
-static void schoolbook_avx(__m256i *c, const __m256i *a, const __m256i *b) {
-    __m256i a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3, b4, b5, b6, b7;
-    __m256i temp;
-
+static void schoolbook16x16_acc(__m256i *c, const __m256i *a, const __m256i *b) {
+    __m256i a0, a1, a2, a3;
+    __m256i b0, b1, b2, b3;
+    __m256i t0;
     a0 = a[0];
     a1 = a[1];
     a2 = a[2];
     a3 = a[3];
-    a4 = a[4];
-    a5 = a[5];
-    a6 = a[6];
-    a7 = a[7];
-
     b0 = b[0];
     b1 = b[1];
     b2 = b[2];
     b3 = b[3];
-    b4 = b[4];
-    b5 = b[5];
-    b6 = b[6];
-    b7 = b[7];
-
-    c[0] = _mm256_mullo_epi16(a0, b0);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[1] = mul_add(a1, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b2);
-    temp = mul_add(a1, b1, temp);
-    c[2] = mul_add(a2, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b3);
-    temp = mul_add(a1, b2, temp);
-    temp = mul_add(a2, b1, temp);
-    c[3] = mul_add(a3, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b4);
-    temp = mul_add(a1, b3, temp);
-    temp = mul_add(a3, b1, temp);
-    temp = mul_add(a4, b0, temp);
-    c[4] = mul_add(a2, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b5);
-    temp = mul_add(a1, b4, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add( a4, b1, temp);
-    c[5] = mul_add(a5, b0, temp);
-
-    temp = _mm256_mullo_epi16(a0, b6);
-    temp = mul_add(a1, b5, temp);
-    temp = mul_add(a5, b1, temp);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a3, b3, temp);
-    c[6] = mul_add(a4, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b7);
-    temp = mul_add(a1, b6, temp);
-    temp = mul_add(a6, b1, temp);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a4, b3, temp);
-    c[7] = mul_add(a5, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[8]);
-    temp = mul_add(a1, b7, temp);
-    temp = mul_add(a7, b1, temp);
-    temp = mul_add(a[8], b0, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a5, b3, temp);
-    c[8] = mul_add(a6, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[9]);
-    temp = mul_add(a1, b[8], temp);
-    temp = mul_add(a[8], b1, temp);
-    temp = mul_add(a[9], b0, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a6, b3, temp);
-    c[9] = mul_add(a7, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[10]);
-    temp = mul_add(a1, b[9], temp);
-    temp = mul_add(a[9], b1, temp);
-    temp = mul_add(a[10], b0, temp);
-    temp = mul_add(a2, b[8], temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a7, b3, temp);
-    c[10] = mul_add(a[8], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[11]);
-    temp = mul_add(a1, b[10], temp);
-    temp = mul_add(a[10], b1, temp);
-    temp = mul_add(a[11], b0, temp);
-    temp = mul_add(a2, b[9], temp);
-    temp = mul_add(a3, b[8], temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a[8], b3, temp);
-    c[11] = mul_add(a[9], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[12]);
-    temp = mul_add(a1, b[11], temp);
-    temp = mul_add(a[11], b1, temp);
-    temp = mul_add(a[12], b0, temp);
-    temp = mul_add(a2, b[10], temp);
-    temp = mul_add(a3, b[9], temp);
-    temp = mul_add(a4, b[8], temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a[8], b4, temp);
-    temp = mul_add(a[9], b3, temp);
-    c[12] = mul_add(a[10], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[13]);
-    temp = mul_add(a1, b[12], temp);
-    temp = mul_add(a[12], b1, temp);
-    temp = mul_add(a[13], b0, temp);
-    temp = mul_add(a2, b[11], temp);
-    temp = mul_add(a3, b[10], temp);
-    temp = mul_add(a4, b[9], temp);
-    temp = mul_add(a5, b[8], temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a[8], b5, temp);
-    temp = mul_add(a[9], b4, temp);
-    temp = mul_add(a[10], b3, temp);
-    c[13] = mul_add(a[11], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[14]);
-    temp = mul_add(a1, b[13], temp);
-    temp = mul_add(a[13], b1, temp);
-    temp = mul_add(a[14], b0, temp);
-    temp = mul_add(a2, b[12], temp);
-    temp = mul_add(a3, b[11], temp);
-    temp = mul_add(a4, b[10], temp);
-    temp = mul_add(a5, b[9], temp);
-    temp = mul_add(a6, b[8], temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a[8], b6, temp);
-    temp = mul_add(a[9], b5, temp);
-    temp = mul_add(a[10], b4, temp);
-    temp = mul_add(a[11], b3, temp);
-    c[14] = mul_add(a[12], b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b[15]);
-    temp = mul_add(a1, b[14], temp);
-    temp = mul_add(a[14], b1, temp);
-    temp = mul_add(a[15], b0, temp);
-    temp = mul_add(a2, b[13], temp);
-    temp = mul_add(a3, b[12], temp);
-    temp = mul_add(a4, b[11], temp);
-    temp = mul_add(a5, b[10], temp);
-    temp = mul_add(a6, b[9], temp);
-    temp = mul_add(a7, b[8], temp);
-    temp = mul_add(a[8], b7, temp);
-    temp = mul_add(a[9], b6, temp);
-    temp = mul_add(a[10], b5, temp);
-    temp = mul_add(a[11], b4, temp);
-    temp = mul_add(a[12], b3, temp);
-    c[15] = mul_add(a[13], b2, temp);
-
-    // unrolled second triangle
-    a0 = a[14];
-    a1 = a[15];
-    a2 = a[13];
-    a3 = a[12];
-    a4 = a[11];
-    a5 = a[10];
-    a6 = a[9];
-    a7 = a[8];
-
-    b0 = b[14];
-    b1 = b[15];
-    b2 = b[13];
-    b3 = b[12];
-    b4 = b[11];
-    b5 = b[10];
-    b6 = b[9];
-    b7 = b[8];
-
-    temp = _mm256_mullo_epi16(a[1], b1);
-    temp = mul_add(a[2], b0, temp);
-    temp = mul_add(a[3], b2, temp);
-    temp = mul_add(a[4], b3, temp);
-    temp = mul_add(a[5], b4, temp);
-    temp = mul_add(a[6], b5, temp);
-    temp = mul_add(a[7], b6, temp);
-    temp = mul_add(a7, b7, temp);
-    temp = mul_add(a6, b[7], temp);
-    temp = mul_add(a5, b[6], temp);
-    temp = mul_add(a4, b[5], temp);
-    temp = mul_add(a3, b[4], temp);
-    temp = mul_add(a2, b[3], temp);
-    temp = mul_add(a0, b[2], temp);
-    c[16] = mul_add(a1, b[1], temp);
-
-    temp = _mm256_mullo_epi16(a[2], b1);
-    temp = mul_add(a[3], b0, temp);
-    temp = mul_add(a[4], b2, temp);
-    temp = mul_add(a[5], b3, temp);
-    temp = mul_add(a[6], b4, temp);
-    temp = mul_add(a[7], b5, temp);
-    temp = mul_add(a7, b6, temp);
-    temp = mul_add(a6, b7, temp);
-    temp = mul_add(a5, b[7], temp);
-    temp = mul_add(a4, b[6], temp);
-    temp = mul_add(a3, b[5], temp);
-    temp = mul_add(a2, b[4], temp);
-    temp = mul_add(a0, b[3], temp);
-    c[17] = mul_add(a1, b[2], temp);
-
-    temp = _mm256_mullo_epi16(a[3], b1);
-    temp = mul_add(a[4], b0, temp);
-    temp = mul_add(a[5], b2, temp);
-    temp = mul_add(a[6], b3, temp);
-    temp = mul_add(a[7], b4, temp);
-    temp = mul_add(a7, b5, temp);
-    temp = mul_add(a6, b6, temp);
-    temp = mul_add(a5, b7, temp);
-    temp = mul_add(a4, b[7], temp);
-    temp = mul_add(a3, b[6], temp);
-    temp = mul_add(a2, b[5], temp);
-    temp = mul_add(a0, b[4], temp);
-    c[18] = mul_add(a1, b[3], temp);
-
-    temp = _mm256_mullo_epi16(a[4], b1);
-    temp = mul_add(a[5], b0, temp);
-    temp = mul_add(a[6], b2, temp);
-    temp = mul_add(a[7], b3, temp);
-    temp = mul_add(a7, b4, temp);
-    temp = mul_add(a6, b5, temp);
-    temp = mul_add(a5, b6, temp);
-    temp = mul_add(a4, b7, temp);
-    temp = mul_add(a3, b[7], temp);
-    temp = mul_add(a2, b[6], temp);
-    temp = mul_add(a0, b[5], temp);
-    c[19] = mul_add(a1, b[4], temp);
-
-    temp = _mm256_mullo_epi16(a[5], b1);
-    temp = mul_add(a[6], b0, temp);
-    temp = mul_add(a[7], b2, temp);
-    temp = mul_add(a7, b3, temp);
-    temp = mul_add(a6, b4, temp);
-    temp = mul_add(a5, b5, temp);
-    temp = mul_add(a4, b6, temp);
-    temp = mul_add(a3, b7, temp);
-    temp = mul_add(a2, b[7], temp);
-    temp = mul_add(a0, b[6], temp);
-    c[20] = mul_add(a1, b[5], temp);
-
-    temp = _mm256_mullo_epi16(a[6], b1);
-    temp = mul_add(a[7], b0, temp);
-    temp = mul_add(a7, b2, temp);
-    temp = mul_add(a6, b3, temp);
-    temp = mul_add(a5, b4, temp);
-    temp = mul_add(a4, b5, temp);
-    temp = mul_add(a3, b6, temp);
-    temp = mul_add(a2, b7, temp);
-    temp = mul_add(a0, b[7], temp);
-    c[21] = mul_add(a1, b[6], temp);
-
-    temp = _mm256_mullo_epi16(a[7], b1);
-    temp = mul_add(a7, b0, temp);
-    temp = mul_add(a6, b2, temp);
-    temp = mul_add(a5, b3, temp);
-    temp = mul_add(a4, b4, temp);
-    temp = mul_add(a3, b5, temp);
-    temp = mul_add(a2, b6, temp);
-    temp = mul_add(a0, b7, temp);
-    c[22] = mul_add(a1, b[7], temp);
-
-    temp = _mm256_mullo_epi16(a7, b1);
-    temp = mul_add(a6, b0, temp);
-    temp = mul_add(a5, b2, temp);
-    temp = mul_add(a4, b3, temp);
-    temp = mul_add(a3, b4, temp);
-    temp = mul_add(a2, b5, temp);
-    temp = mul_add(a0, b6, temp);
-    c[23] = mul_add(a1, b7, temp);
-
-    temp = _mm256_mullo_epi16(a6, b1);
-    temp = mul_add(a5, b0, temp);
-    temp = mul_add(a4, b2, temp);
-    temp = mul_add(a3, b3, temp);
-    temp = mul_add(a2, b4, temp);
-    temp = mul_add(a0, b5, temp);
-    c[24] = mul_add(a1, b6, temp);
-
-    temp = _mm256_mullo_epi16(a5, b1);
-    temp = mul_add(a4, b0, temp);
-    temp = mul_add(a3, b2, temp);
-    temp = mul_add(a2, b3, temp);
-    temp = mul_add(a0, b4, temp);
-    c[25] = mul_add(a1, b5, temp);
-
-    temp = _mm256_mullo_epi16(a4, b1);
-    temp = mul_add(a3, b0, temp);
-    temp = mul_add(a2, b2, temp);
-    temp = mul_add(a0, b3, temp);
-    c[26] = mul_add(a1, b4, temp);
-
-    temp = _mm256_mullo_epi16(a3, b1);
-    temp = mul_add(a2, b0, temp);
-    temp = mul_add(a0, b2, temp);
-    c[27] = mul_add(a1, b3, temp);
-
-    temp = _mm256_mullo_epi16(a2, b1);
-    temp = mul_add(a0, b0, temp);
-    c[28] = mul_add(a1, b2, temp);
-
-    temp = _mm256_mullo_epi16(a0, b1);
-    c[29] = mul_add(a1, b0, temp);
-
-    c[30] = _mm256_mullo_epi16(a1, b1);
-
-    c[31] = _mm256_set_epi64x(0, 0, 0, 0);
+    c[0] = mac(a0, b0, c[0]);
+    t0 = mac(a0, b1, c[1]);
+    c[1] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[2]);
+    t0 = mac(a1, b1, t0);
+    c[2] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[3]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[3] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[4]);
+    t0 = mac(a2, b2, t0);
+    c[4] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[5]);
+    c[5] = mac(a3, b2, t0);
+    c[6] = mac(a3, b3, c[6]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    a0 = a[4];
+    a1 = a[5];
+    a2 = a[6];
+    a3 = a[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[4] = mac(a0, b0, c[4]);
+    t0 = mac(a0, b1, c[5]);
+    c[5] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[6]);
+    t0 = mac(a1, b1, t0);
+    c[6] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[7]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[7] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[8]);
+    t0 = mac(a2, b2, t0);
+    c[8] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[9]);
+    c[9] = mac(a3, b2, t0);
+    c[10] = mac(a3, b3, c[10]);
+    a0 = a[8];
+    a1 = a[9];
+    a2 = a[10];
+    a3 = a[11];
+    c[8] = mac(a0, b0, c[8]);
+    t0 = mac(a0, b1, c[9]);
+    c[9] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[10]);
+    t0 = mac(a1, b1, t0);
+    c[10] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[11]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[11] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[12]);
+    t0 = mac(a2, b2, t0);
+    c[12] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[13]);
+    c[13] = mac(a3, b2, t0);
+    c[14] = mac(a3, b3, c[14]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[12];
+    b1 = b[13];
+    b2 = b[14];
+    b3 = b[15];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    a0 = a[12];
+    a1 = a[13];
+    a2 = a[14];
+    a3 = a[15];
+    c[24] = mac(a0, b0, c[24]);
+    t0 = mac(a0, b1, c[25]);
+    c[25] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[26]);
+    t0 = mac(a1, b1, t0);
+    c[26] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[27]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[27] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[28]);
+    t0 = mac(a2, b2, t0);
+    c[28] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[29]);
+    c[29] = mac(a3, b2, t0);
+    c[30] = mac(a3, b3, c[30]);
+    b0 = b[8];
+    b1 = b[9];
+    b2 = b[10];
+    b3 = b[11];
+    c[20] = mac(a0, b0, c[20]);
+    t0 = mac(a0, b1, c[21]);
+    c[21] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[22]);
+    t0 = mac(a1, b1, t0);
+    c[22] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[23]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[23] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[24]);
+    t0 = mac(a2, b2, t0);
+    c[24] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[25]);
+    c[25] = mac(a3, b2, t0);
+    c[26] = mac(a3, b3, c[26]);
+    b0 = b[4];
+    b1 = b[5];
+    b2 = b[6];
+    b3 = b[7];
+    c[16] = mac(a0, b0, c[16]);
+    t0 = mac(a0, b1, c[17]);
+    c[17] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[18]);
+    t0 = mac(a1, b1, t0);
+    c[18] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[19]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[19] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[20]);
+    t0 = mac(a2, b2, t0);
+    c[20] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[21]);
+    c[21] = mac(a3, b2, t0);
+    c[22] = mac(a3, b3, c[22]);
+    b0 = b[0];
+    b1 = b[1];
+    b2 = b[2];
+    b3 = b[3];
+    c[12] = mac(a0, b0, c[12]);
+    t0 = mac(a0, b1, c[13]);
+    c[13] = mac(a1, b0, t0);
+    t0 = mac(a0, b2, c[14]);
+    t0 = mac(a1, b1, t0);
+    c[14] = mac(a2, b0, t0);
+    t0 = mac(a0, b3, c[15]);
+    t0 = mac(a1, b2, t0);
+    t0 = mac(a2, b1, t0);
+    c[15] = mac(a3, b0, t0);
+    t0 = mac(a1, b3, c[16]);
+    t0 = mac(a2, b2, t0);
+    c[16] = mac(a3, b1, t0);
+    t0 = mac(a2, b3, c[17]);
+    c[17] = mac(a3, b2, t0);
+    c[18] = mac(a3, b3, c[18]);
 }
 
+
 static void transpose(__m256i *M) {
     __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11;
     __m256i temp, temp0, temp1, temp2;
@@ -916,15 +888,15 @@ static void batch_64coefficient_multiplications(toom4_points_product *c_eval, co
     //-----------------Forward transposes ends---------------------------------
 
     if (accumulate == 0) {
-        schoolbook_avx(vc, va, vb);
-        schoolbook_avx(vc + 32, va + 16, vb + 16);
-        schoolbook_avx(vc + 64, va + 32, vb + 32);
-        schoolbook_avx(vc + 96, va + 48, vb + 48);
+        schoolbook16x16(vc, va, vb);
+        schoolbook16x16(vc + 32, va + 16, vb + 16);
+        schoolbook16x16(vc + 64, va + 32, vb + 32);
+        schoolbook16x16(vc + 96, va + 48, vb + 48);
     } else {
-        schoolbook_avx_acc(vc, va, vb);
-        schoolbook_avx_acc(vc + 32, va + 16, vb + 16);
-        schoolbook_avx_acc(vc + 64, va + 32, vb + 32);
-        schoolbook_avx_acc(vc + 96, va + 48, vb + 48);
+        schoolbook16x16_acc(vc, va, vb);
+        schoolbook16x16_acc(vc + 32, va + 16, vb + 16);
+        schoolbook16x16_acc(vc + 64, va + 32, vb + 32);
+        schoolbook16x16_acc(vc + 96, va + 48, vb + 48);
     }
 }