Normalize whitespace

5 years ago · 7028025eb9
--- a/.clang-format
+++ b/.clang-format
@@ -2,5 +2,6 @@
 Language:        Cpp
 BasedOnStyle:  LLVM
 AllowShortFunctionsOnASingleLine: false
 IndentWidth: 4
 ...

--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,10 @@ clean:
 .PHONY: format
 format:
 	find . -iname *.h -o -iname *.c | xargs clang-format -i -style=file
 ifneq (,$(shell which dos2unix))
 	# TODO should we make this mandatory?
 	find . -iname *.h -o -iname *.c | xargs dos2unix -q
 endif

 .PHONY: tidy
 tidy: require_scheme
--- a/common/fips202.c
+++ b/common/fips202.c
@@ -22,14 +22,14 @@
 * Returns the loaded 64-bit unsigned integer
 **************************************************/
 static uint64_t load64(const unsigned char *x) {
  unsigned int i;
  uint64_t r = 0;
    unsigned int i;
    uint64_t r = 0;

  for (i = 0; i < 8; ++i) {
    r |= (uint64_t)x[i] << 8 * i;
  }
    for (i = 0; i < 8; ++i) {
        r |= (uint64_t)x[i] << 8 * i;
    }

  return r;
    return r;
 }

 /*************************************************
@@ -41,11 +41,11 @@ static uint64_t load64(const unsigned char *x) {
 *              - uint64_t u: input 64-bit unsigned integer
 **************************************************/
 static void store64(uint8_t *x, uint64_t u) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < 8; ++i) {
    x[i] = u >> 8 * i;
  }
    for (i = 0; i < 8; ++i) {
        x[i] = u >> 8 * i;
    }
 }

 /* Keccak round constants */
@@ -71,266 +71,266 @@ static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
 * Arguments:   - uint64_t *state: pointer to input/output Keccak state
 **************************************************/
 static void KeccakF1600_StatePermute(uint64_t *state) {
  int round;

  uint64_t Aba, Abe, Abi, Abo, Abu;
  uint64_t Aga, Age, Agi, Ago, Agu;
  uint64_t Aka, Ake, Aki, Ako, Aku;
  uint64_t Ama, Ame, Ami, Amo, Amu;
  uint64_t Asa, Ase, Asi, Aso, Asu;
  uint64_t BCa, BCe, BCi, BCo, BCu;
  uint64_t Da, De, Di, Do, Du;
  uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
  uint64_t Ega, Ege, Egi, Ego, Egu;
  uint64_t Eka, Eke, Eki, Eko, Eku;
  uint64_t Ema, Eme, Emi, Emo, Emu;
  uint64_t Esa, Ese, Esi, Eso, Esu;

  // copyFromState(A, state)
  Aba = state[0];
  Abe = state[1];
  Abi = state[2];
  Abo = state[3];
  Abu = state[4];
  Aga = state[5];
  Age = state[6];
  Agi = state[7];
  Ago = state[8];
  Agu = state[9];
  Aka = state[10];
  Ake = state[11];
  Aki = state[12];
  Ako = state[13];
  Aku = state[14];
  Ama = state[15];
  Ame = state[16];
  Ami = state[17];
  Amo = state[18];
  Amu = state[19];
  Asa = state[20];
  Ase = state[21];
  Asi = state[22];
  Aso = state[23];
  Asu = state[24];

  for (round = 0; round < NROUNDS; round += 2) {
    //    prepareTheta
    BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
    BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
    BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
    BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
    BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;

    // thetaRhoPiChiIotaPrepareTheta(round  , A, E)
    Da = BCu ^ ROL(BCe, 1);
    De = BCa ^ ROL(BCi, 1);
    Di = BCe ^ ROL(BCo, 1);
    Do = BCi ^ ROL(BCu, 1);
    Du = BCo ^ ROL(BCa, 1);

    Aba ^= Da;
    BCa = Aba;
    Age ^= De;
    BCe = ROL(Age, 44);
    Aki ^= Di;
    BCi = ROL(Aki, 43);
    Amo ^= Do;
    BCo = ROL(Amo, 21);
    Asu ^= Du;
    BCu = ROL(Asu, 14);
    Eba = BCa ^ ((~BCe) & BCi);
    Eba ^= KeccakF_RoundConstants[round];
    Ebe = BCe ^ ((~BCi) & BCo);
    Ebi = BCi ^ ((~BCo) & BCu);
    Ebo = BCo ^ ((~BCu) & BCa);
    Ebu = BCu ^ ((~BCa) & BCe);

    Abo ^= Do;
    BCa = ROL(Abo, 28);
    Agu ^= Du;
    BCe = ROL(Agu, 20);
    Aka ^= Da;
    BCi = ROL(Aka, 3);
    Ame ^= De;
    BCo = ROL(Ame, 45);
    Asi ^= Di;
    BCu = ROL(Asi, 61);
    Ega = BCa ^ ((~BCe) & BCi);
    Ege = BCe ^ ((~BCi) & BCo);
    Egi = BCi ^ ((~BCo) & BCu);
    Ego = BCo ^ ((~BCu) & BCa);
    Egu = BCu ^ ((~BCa) & BCe);

    Abe ^= De;
    BCa = ROL(Abe, 1);
    Agi ^= Di;
    BCe = ROL(Agi, 6);
    Ako ^= Do;
    BCi = ROL(Ako, 25);
    Amu ^= Du;
    BCo = ROL(Amu, 8);
    Asa ^= Da;
    BCu = ROL(Asa, 18);
    Eka = BCa ^ ((~BCe) & BCi);
    Eke = BCe ^ ((~BCi) & BCo);
    Eki = BCi ^ ((~BCo) & BCu);
    Eko = BCo ^ ((~BCu) & BCa);
    Eku = BCu ^ ((~BCa) & BCe);

    Abu ^= Du;
    BCa = ROL(Abu, 27);
    Aga ^= Da;
    BCe = ROL(Aga, 36);
    Ake ^= De;
    BCi = ROL(Ake, 10);
    Ami ^= Di;
    BCo = ROL(Ami, 15);
    Aso ^= Do;
    BCu = ROL(Aso, 56);
    Ema = BCa ^ ((~BCe) & BCi);
    Eme = BCe ^ ((~BCi) & BCo);
    Emi = BCi ^ ((~BCo) & BCu);
    Emo = BCo ^ ((~BCu) & BCa);
    Emu = BCu ^ ((~BCa) & BCe);

    Abi ^= Di;
    BCa = ROL(Abi, 62);
    Ago ^= Do;
    BCe = ROL(Ago, 55);
    Aku ^= Du;
    BCi = ROL(Aku, 39);
    Ama ^= Da;
    BCo = ROL(Ama, 41);
    Ase ^= De;
    BCu = ROL(Ase, 2);
    Esa = BCa ^ ((~BCe) & BCi);
    Ese = BCe ^ ((~BCi) & BCo);
    Esi = BCi ^ ((~BCo) & BCu);
    Eso = BCo ^ ((~BCu) & BCa);
    Esu = BCu ^ ((~BCa) & BCe);

    //    prepareTheta
    BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
    BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
    BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
    BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
    BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;

    // thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
    Da = BCu ^ ROL(BCe, 1);
    De = BCa ^ ROL(BCi, 1);
    Di = BCe ^ ROL(BCo, 1);
    Do = BCi ^ ROL(BCu, 1);
    Du = BCo ^ ROL(BCa, 1);

    Eba ^= Da;
    BCa = Eba;
    Ege ^= De;
    BCe = ROL(Ege, 44);
    Eki ^= Di;
    BCi = ROL(Eki, 43);
    Emo ^= Do;
    BCo = ROL(Emo, 21);
    Esu ^= Du;
    BCu = ROL(Esu, 14);
    Aba = BCa ^ ((~BCe) & BCi);
    Aba ^= KeccakF_RoundConstants[round + 1];
    Abe = BCe ^ ((~BCi) & BCo);
    Abi = BCi ^ ((~BCo) & BCu);
    Abo = BCo ^ ((~BCu) & BCa);
    Abu = BCu ^ ((~BCa) & BCe);

    Ebo ^= Do;
    BCa = ROL(Ebo, 28);
    Egu ^= Du;
    BCe = ROL(Egu, 20);
    Eka ^= Da;
    BCi = ROL(Eka, 3);
    Eme ^= De;
    BCo = ROL(Eme, 45);
    Esi ^= Di;
    BCu = ROL(Esi, 61);
    Aga = BCa ^ ((~BCe) & BCi);
    Age = BCe ^ ((~BCi) & BCo);
    Agi = BCi ^ ((~BCo) & BCu);
    Ago = BCo ^ ((~BCu) & BCa);
    Agu = BCu ^ ((~BCa) & BCe);

    Ebe ^= De;
    BCa = ROL(Ebe, 1);
    Egi ^= Di;
    BCe = ROL(Egi, 6);
    Eko ^= Do;
    BCi = ROL(Eko, 25);
    Emu ^= Du;
    BCo = ROL(Emu, 8);
    Esa ^= Da;
    BCu = ROL(Esa, 18);
    Aka = BCa ^ ((~BCe) & BCi);
    Ake = BCe ^ ((~BCi) & BCo);
    Aki = BCi ^ ((~BCo) & BCu);
    Ako = BCo ^ ((~BCu) & BCa);
    Aku = BCu ^ ((~BCa) & BCe);

    Ebu ^= Du;
    BCa = ROL(Ebu, 27);
    Ega ^= Da;
    BCe = ROL(Ega, 36);
    Eke ^= De;
    BCi = ROL(Eke, 10);
    Emi ^= Di;
    BCo = ROL(Emi, 15);
    Eso ^= Do;
    BCu = ROL(Eso, 56);
    Ama = BCa ^ ((~BCe) & BCi);
    Ame = BCe ^ ((~BCi) & BCo);
    Ami = BCi ^ ((~BCo) & BCu);
    Amo = BCo ^ ((~BCu) & BCa);
    Amu = BCu ^ ((~BCa) & BCe);

    Ebi ^= Di;
    BCa = ROL(Ebi, 62);
    Ego ^= Do;
    BCe = ROL(Ego, 55);
    Eku ^= Du;
    BCi = ROL(Eku, 39);
    Ema ^= Da;
    BCo = ROL(Ema, 41);
    Ese ^= De;
    BCu = ROL(Ese, 2);
    Asa = BCa ^ ((~BCe) & BCi);
    Ase = BCe ^ ((~BCi) & BCo);
    Asi = BCi ^ ((~BCo) & BCu);
    Aso = BCo ^ ((~BCu) & BCa);
    Asu = BCu ^ ((~BCa) & BCe);
  }

  // copyToState(state, A)
  state[0] = Aba;
  state[1] = Abe;
  state[2] = Abi;
  state[3] = Abo;
  state[4] = Abu;
  state[5] = Aga;
  state[6] = Age;
  state[7] = Agi;
  state[8] = Ago;
  state[9] = Agu;
  state[10] = Aka;
  state[11] = Ake;
  state[12] = Aki;
  state[13] = Ako;
  state[14] = Aku;
  state[15] = Ama;
  state[16] = Ame;
  state[17] = Ami;
  state[18] = Amo;
  state[19] = Amu;
  state[20] = Asa;
  state[21] = Ase;
  state[22] = Asi;
  state[23] = Aso;
  state[24] = Asu;
    int round;

    uint64_t Aba, Abe, Abi, Abo, Abu;
    uint64_t Aga, Age, Agi, Ago, Agu;
    uint64_t Aka, Ake, Aki, Ako, Aku;
    uint64_t Ama, Ame, Ami, Amo, Amu;
    uint64_t Asa, Ase, Asi, Aso, Asu;
    uint64_t BCa, BCe, BCi, BCo, BCu;
    uint64_t Da, De, Di, Do, Du;
    uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
    uint64_t Ega, Ege, Egi, Ego, Egu;
    uint64_t Eka, Eke, Eki, Eko, Eku;
    uint64_t Ema, Eme, Emi, Emo, Emu;
    uint64_t Esa, Ese, Esi, Eso, Esu;

    // copyFromState(A, state)
    Aba = state[0];
    Abe = state[1];
    Abi = state[2];
    Abo = state[3];
    Abu = state[4];
    Aga = state[5];
    Age = state[6];
    Agi = state[7];
    Ago = state[8];
    Agu = state[9];
    Aka = state[10];
    Ake = state[11];
    Aki = state[12];
    Ako = state[13];
    Aku = state[14];
    Ama = state[15];
    Ame = state[16];
    Ami = state[17];
    Amo = state[18];
    Amu = state[19];
    Asa = state[20];
    Ase = state[21];
    Asi = state[22];
    Aso = state[23];
    Asu = state[24];

    for (round = 0; round < NROUNDS; round += 2) {
        //    prepareTheta
        BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa;
        BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase;
        BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi;
        BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso;
        BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu;

        // thetaRhoPiChiIotaPrepareTheta(round  , A, E)
        Da = BCu ^ ROL(BCe, 1);
        De = BCa ^ ROL(BCi, 1);
        Di = BCe ^ ROL(BCo, 1);
        Do = BCi ^ ROL(BCu, 1);
        Du = BCo ^ ROL(BCa, 1);

        Aba ^= Da;
        BCa = Aba;
        Age ^= De;
        BCe = ROL(Age, 44);
        Aki ^= Di;
        BCi = ROL(Aki, 43);
        Amo ^= Do;
        BCo = ROL(Amo, 21);
        Asu ^= Du;
        BCu = ROL(Asu, 14);
        Eba = BCa ^ ((~BCe) & BCi);
        Eba ^= KeccakF_RoundConstants[round];
        Ebe = BCe ^ ((~BCi) & BCo);
        Ebi = BCi ^ ((~BCo) & BCu);
        Ebo = BCo ^ ((~BCu) & BCa);
        Ebu = BCu ^ ((~BCa) & BCe);

        Abo ^= Do;
        BCa = ROL(Abo, 28);
        Agu ^= Du;
        BCe = ROL(Agu, 20);
        Aka ^= Da;
        BCi = ROL(Aka, 3);
        Ame ^= De;
        BCo = ROL(Ame, 45);
        Asi ^= Di;
        BCu = ROL(Asi, 61);
        Ega = BCa ^ ((~BCe) & BCi);
        Ege = BCe ^ ((~BCi) & BCo);
        Egi = BCi ^ ((~BCo) & BCu);
        Ego = BCo ^ ((~BCu) & BCa);
        Egu = BCu ^ ((~BCa) & BCe);

        Abe ^= De;
        BCa = ROL(Abe, 1);
        Agi ^= Di;
        BCe = ROL(Agi, 6);
        Ako ^= Do;
        BCi = ROL(Ako, 25);
        Amu ^= Du;
        BCo = ROL(Amu, 8);
        Asa ^= Da;
        BCu = ROL(Asa, 18);
        Eka = BCa ^ ((~BCe) & BCi);
        Eke = BCe ^ ((~BCi) & BCo);
        Eki = BCi ^ ((~BCo) & BCu);
        Eko = BCo ^ ((~BCu) & BCa);
        Eku = BCu ^ ((~BCa) & BCe);

        Abu ^= Du;
        BCa = ROL(Abu, 27);
        Aga ^= Da;
        BCe = ROL(Aga, 36);
        Ake ^= De;
        BCi = ROL(Ake, 10);
        Ami ^= Di;
        BCo = ROL(Ami, 15);
        Aso ^= Do;
        BCu = ROL(Aso, 56);
        Ema = BCa ^ ((~BCe) & BCi);
        Eme = BCe ^ ((~BCi) & BCo);
        Emi = BCi ^ ((~BCo) & BCu);
        Emo = BCo ^ ((~BCu) & BCa);
        Emu = BCu ^ ((~BCa) & BCe);

        Abi ^= Di;
        BCa = ROL(Abi, 62);
        Ago ^= Do;
        BCe = ROL(Ago, 55);
        Aku ^= Du;
        BCi = ROL(Aku, 39);
        Ama ^= Da;
        BCo = ROL(Ama, 41);
        Ase ^= De;
        BCu = ROL(Ase, 2);
        Esa = BCa ^ ((~BCe) & BCi);
        Ese = BCe ^ ((~BCi) & BCo);
        Esi = BCi ^ ((~BCo) & BCu);
        Eso = BCo ^ ((~BCu) & BCa);
        Esu = BCu ^ ((~BCa) & BCe);

        //    prepareTheta
        BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa;
        BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese;
        BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi;
        BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso;
        BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu;

        // thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
        Da = BCu ^ ROL(BCe, 1);
        De = BCa ^ ROL(BCi, 1);
        Di = BCe ^ ROL(BCo, 1);
        Do = BCi ^ ROL(BCu, 1);
        Du = BCo ^ ROL(BCa, 1);

        Eba ^= Da;
        BCa = Eba;
        Ege ^= De;
        BCe = ROL(Ege, 44);
        Eki ^= Di;
        BCi = ROL(Eki, 43);
        Emo ^= Do;
        BCo = ROL(Emo, 21);
        Esu ^= Du;
        BCu = ROL(Esu, 14);
        Aba = BCa ^ ((~BCe) & BCi);
        Aba ^= KeccakF_RoundConstants[round + 1];
        Abe = BCe ^ ((~BCi) & BCo);
        Abi = BCi ^ ((~BCo) & BCu);
        Abo = BCo ^ ((~BCu) & BCa);
        Abu = BCu ^ ((~BCa) & BCe);

        Ebo ^= Do;
        BCa = ROL(Ebo, 28);
        Egu ^= Du;
        BCe = ROL(Egu, 20);
        Eka ^= Da;
        BCi = ROL(Eka, 3);
        Eme ^= De;
        BCo = ROL(Eme, 45);
        Esi ^= Di;
        BCu = ROL(Esi, 61);
        Aga = BCa ^ ((~BCe) & BCi);
        Age = BCe ^ ((~BCi) & BCo);
        Agi = BCi ^ ((~BCo) & BCu);
        Ago = BCo ^ ((~BCu) & BCa);
        Agu = BCu ^ ((~BCa) & BCe);

        Ebe ^= De;
        BCa = ROL(Ebe, 1);
        Egi ^= Di;
        BCe = ROL(Egi, 6);
        Eko ^= Do;
        BCi = ROL(Eko, 25);
        Emu ^= Du;
        BCo = ROL(Emu, 8);
        Esa ^= Da;
        BCu = ROL(Esa, 18);
        Aka = BCa ^ ((~BCe) & BCi);
        Ake = BCe ^ ((~BCi) & BCo);
        Aki = BCi ^ ((~BCo) & BCu);
        Ako = BCo ^ ((~BCu) & BCa);
        Aku = BCu ^ ((~BCa) & BCe);

        Ebu ^= Du;
        BCa = ROL(Ebu, 27);
        Ega ^= Da;
        BCe = ROL(Ega, 36);
        Eke ^= De;
        BCi = ROL(Eke, 10);
        Emi ^= Di;
        BCo = ROL(Emi, 15);
        Eso ^= Do;
        BCu = ROL(Eso, 56);
        Ama = BCa ^ ((~BCe) & BCi);
        Ame = BCe ^ ((~BCi) & BCo);
        Ami = BCi ^ ((~BCo) & BCu);
        Amo = BCo ^ ((~BCu) & BCa);
        Amu = BCu ^ ((~BCa) & BCe);

        Ebi ^= Di;
        BCa = ROL(Ebi, 62);
        Ego ^= Do;
        BCe = ROL(Ego, 55);
        Eku ^= Du;
        BCi = ROL(Eku, 39);
        Ema ^= Da;
        BCo = ROL(Ema, 41);
        Ese ^= De;
        BCu = ROL(Ese, 2);
        Asa = BCa ^ ((~BCe) & BCi);
        Ase = BCe ^ ((~BCi) & BCo);
        Asi = BCi ^ ((~BCo) & BCu);
        Aso = BCo ^ ((~BCu) & BCa);
        Asu = BCu ^ ((~BCa) & BCe);
    }

    // copyToState(state, A)
    state[0] = Aba;
    state[1] = Abe;
    state[2] = Abi;
    state[3] = Abo;
    state[4] = Abu;
    state[5] = Aga;
    state[6] = Age;
    state[7] = Agi;
    state[8] = Ago;
    state[9] = Agu;
    state[10] = Aka;
    state[11] = Ake;
    state[12] = Aki;
    state[13] = Ako;
    state[14] = Aku;
    state[15] = Ama;
    state[16] = Ame;
    state[17] = Ami;
    state[18] = Amo;
    state[19] = Amu;
    state[20] = Asa;
    state[21] = Ase;
    state[22] = Asi;
    state[23] = Aso;
    state[24] = Asu;
 }

 /*************************************************
@@ -348,35 +348,35 @@ static void KeccakF1600_StatePermute(uint64_t *state) {
 **************************************************/
 static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m,
                          unsigned long long mlen, unsigned char p) {
  unsigned int i;
  unsigned char t[200];
    unsigned int i;
    unsigned char t[200];

    /* Zero state */
    for (i = 0; i < 25; ++i) {
        s[i] = 0;
    }

  /* Zero state */
  for (i = 0; i < 25; ++i) {
    s[i] = 0;
  }
    while (mlen >= r) {
        for (i = 0; i < r / 8; ++i) {
            s[i] ^= load64(m + 8 * i);
        }

  while (mlen >= r) {
    for (i = 0; i < r / 8; ++i) {
      s[i] ^= load64(m + 8 * i);
        KeccakF1600_StatePermute(s);
        mlen -= r;
        m += r;
    }

    KeccakF1600_StatePermute(s);
    mlen -= r;
    m += r;
  }

  for (i = 0; i < r; ++i) {
    t[i] = 0;
  }
  for (i = 0; i < mlen; ++i) {
    t[i] = m[i];
  }
  t[i] = p;
  t[r - 1] |= 128;
  for (i = 0; i < r / 8; ++i) {
    s[i] ^= load64(t + 8 * i);
  }
    for (i = 0; i < r; ++i) {
        t[i] = 0;
    }
    for (i = 0; i < mlen; ++i) {
        t[i] = m[i];
    }
    t[i] = p;
    t[r - 1] |= 128;
    for (i = 0; i < r / 8; ++i) {
        s[i] ^= load64(t + 8 * i);
    }
 }

 /*************************************************
@@ -394,16 +394,16 @@ static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m,
 **************************************************/
 static void keccak_squeezeblocks(unsigned char *h, unsigned long nblocks,
                                 uint64_t *s, unsigned int r) {
  unsigned int i;

  while (nblocks > 0) {
    KeccakF1600_StatePermute(s);
    for (i = 0; i < (r >> 3); i++) {
      store64(h + 8 * i, s[i]);
    unsigned int i;

    while (nblocks > 0) {
        KeccakF1600_StatePermute(s);
        for (i = 0; i < (r >> 3); i++) {
            store64(h + 8 * i, s[i]);
        }
        h += r;
        nblocks--;
    }
    h += r;
    nblocks--;
  }
 }

 /*************************************************
@@ -419,7 +419,7 @@ static void keccak_squeezeblocks(unsigned char *h, unsigned long nblocks,
 **************************************************/
 void shake128_absorb(uint64_t *s, const unsigned char *input,
                     unsigned long long inlen) {
  keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F);
    keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F);
 }

 /*************************************************
@@ -436,7 +436,7 @@ void shake128_absorb(uint64_t *s, const unsigned char *input,
 **************************************************/
 void shake128_squeezeblocks(unsigned char *output, unsigned long nblocks,
                            uint64_t *s) {
  keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
 }

 /*************************************************
@@ -452,7 +452,7 @@ void shake128_squeezeblocks(unsigned char *output, unsigned long nblocks,
 **************************************************/
 void shake256_absorb(uint64_t *s, const unsigned char *input,
                     unsigned long long inlen) {
  keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F);
    keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F);
 }

 /*************************************************
@@ -469,7 +469,7 @@ void shake256_absorb(uint64_t *s, const unsigned char *input,
 **************************************************/
 void shake256_squeezeblocks(unsigned char *output, unsigned long nblocks,
                            uint64_t *s) {
  keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
    keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
 }

 /*************************************************
@@ -484,23 +484,23 @@ void shake256_squeezeblocks(unsigned char *output, unsigned long nblocks,
 **************************************************/
 void shake128(unsigned char *output, unsigned long long outlen,
              const unsigned char *input, unsigned long long inlen) {
  unsigned int i;
  unsigned long nblocks = outlen / SHAKE128_RATE;
  unsigned char t[SHAKE128_RATE];
  uint64_t s[25];

  shake128_absorb(s, input, inlen);
  shake128_squeezeblocks(output, nblocks, s);

  output += nblocks * SHAKE128_RATE;
  outlen -= nblocks * SHAKE128_RATE;

  if (outlen) {
    shake128_squeezeblocks(t, 1, s);
    for (i = 0; i < outlen; ++i) {
      output[i] = t[i];
    unsigned int i;
    unsigned long nblocks = outlen / SHAKE128_RATE;
    unsigned char t[SHAKE128_RATE];
    uint64_t s[25];

    shake128_absorb(s, input, inlen);
    shake128_squeezeblocks(output, nblocks, s);

    output += nblocks * SHAKE128_RATE;
    outlen -= nblocks * SHAKE128_RATE;

    if (outlen) {
        shake128_squeezeblocks(t, 1, s);
        for (i = 0; i < outlen; ++i) {
            output[i] = t[i];
        }
    }
  }
 }

 /*************************************************
@@ -515,23 +515,23 @@ void shake128(unsigned char *output, unsigned long long outlen,
 **************************************************/
 void shake256(unsigned char *output, unsigned long long outlen,
              const unsigned char *input, unsigned long long inlen) {
  unsigned int i;
  unsigned long nblocks = outlen / SHAKE256_RATE;
  unsigned char t[SHAKE256_RATE];
  uint64_t s[25];

  shake256_absorb(s, input, inlen);
  shake256_squeezeblocks(output, nblocks, s);

  output += nblocks * SHAKE256_RATE;
  outlen -= nblocks * SHAKE256_RATE;

  if (outlen) {
    shake256_squeezeblocks(t, 1, s);
    for (i = 0; i < outlen; ++i) {
      output[i] = t[i];
    unsigned int i;
    unsigned long nblocks = outlen / SHAKE256_RATE;
    unsigned char t[SHAKE256_RATE];
    uint64_t s[25];

    shake256_absorb(s, input, inlen);
    shake256_squeezeblocks(output, nblocks, s);

    output += nblocks * SHAKE256_RATE;
    outlen -= nblocks * SHAKE256_RATE;

    if (outlen) {
        shake256_squeezeblocks(t, 1, s);
        for (i = 0; i < outlen; ++i) {
            output[i] = t[i];
        }
    }
  }
 }

 /*************************************************
@@ -545,19 +545,19 @@ void shake256(unsigned char *output, unsigned long long outlen,
 **************************************************/
 void sha3_256(unsigned char *output, const unsigned char *input,
              unsigned long long inlen) {
  uint64_t s[25];
  unsigned char t[SHA3_256_RATE];
  size_t i;
    uint64_t s[25];
    unsigned char t[SHA3_256_RATE];
    size_t i;

  /* Absorb input */
  keccak_absorb(s, SHA3_256_RATE, input, inlen, 0x06);
    /* Absorb input */
    keccak_absorb(s, SHA3_256_RATE, input, inlen, 0x06);

  /* Squeeze output */
  keccak_squeezeblocks(t, 1, s, SHA3_256_RATE);
    /* Squeeze output */
    keccak_squeezeblocks(t, 1, s, SHA3_256_RATE);

  for (i = 0; i < 32; i++) {
    output[i] = t[i];
  }
    for (i = 0; i < 32; i++) {
        output[i] = t[i];
    }
 }

 /*************************************************
@@ -571,17 +571,17 @@ void sha3_256(unsigned char *output, const unsigned char *input,
 **************************************************/
 void sha3_512(unsigned char *output, const unsigned char *input,
              unsigned long long inlen) {
  uint64_t s[25];
  unsigned char t[SHA3_512_RATE];
  size_t i;
    uint64_t s[25];
    unsigned char t[SHA3_512_RATE];
    size_t i;

  /* Absorb input */
  keccak_absorb(s, SHA3_512_RATE, input, inlen, 0x06);
    /* Absorb input */
    keccak_absorb(s, SHA3_512_RATE, input, inlen, 0x06);

  /* Squeeze output */
  keccak_squeezeblocks(t, 1, s, SHA3_512_RATE);
    /* Squeeze output */
    keccak_squeezeblocks(t, 1, s, SHA3_512_RATE);

  for (i = 0; i < 64; i++) {
    output[i] = t[i];
  }
    for (i = 0; i < 64; i++) {
        output[i] = t[i];
    }
 }
--- a/common/notrandombytes.c
+++ b/common/notrandombytes.c
@@ -20,57 +20,57 @@ static int32_t outleft = 0;
 #define MUSH(i, b) x = t[i] += (((x ^ seed[i]) + sum) ^ ROTATE(x, b));

 static void surf(void) {
  uint32_t t[12];
  uint32_t x;
  uint32_t sum = 0;
  int32_t r;
  int32_t i;
  int32_t loop;
    uint32_t t[12];
    uint32_t x;
    uint32_t sum = 0;
    int32_t r;
    int32_t i;
    int32_t loop;

  for (i = 0; i < 12; ++i) {
    t[i] = in[i] ^ seed[12 + i];
  }
  for (i = 0; i < 8; ++i) {
    out[i] = seed[24 + i];
  }
  x = t[11];
  for (loop = 0; loop < 2; ++loop) {
    for (r = 0; r < 16; ++r) {
      sum += 0x9e3779b9;
      MUSH(0, 5)
      MUSH(1, 7)
      MUSH(2, 9)
      MUSH(3, 13)
      MUSH(4, 5)
      MUSH(5, 7)
      MUSH(6, 9)
      MUSH(7, 13)
      MUSH(8, 5)
      MUSH(9, 7)
      MUSH(10, 9)
      MUSH(11, 13)
    for (i = 0; i < 12; ++i) {
        t[i] = in[i] ^ seed[12 + i];
    }
    for (i = 0; i < 8; ++i) {
      out[i] ^= t[i + 4];
        out[i] = seed[24 + i];
    }
    x = t[11];
    for (loop = 0; loop < 2; ++loop) {
        for (r = 0; r < 16; ++r) {
            sum += 0x9e3779b9;
            MUSH(0, 5)
            MUSH(1, 7)
            MUSH(2, 9)
            MUSH(3, 13)
            MUSH(4, 5)
            MUSH(5, 7)
            MUSH(6, 9)
            MUSH(7, 13)
            MUSH(8, 5)
            MUSH(9, 7)
            MUSH(10, 9)
            MUSH(11, 13)
        }
        for (i = 0; i < 8; ++i) {
            out[i] ^= t[i + 4];
        }
    }
  }
 }

 void randombytes(uint8_t *x, uint64_t xlen) {
  while (xlen > 0) {
    if (!outleft) {
      if (!++in[0]) {
        if (!++in[1]) {
          if (!++in[2]) {
            ++in[3];
          }
    while (xlen > 0) {
        if (!outleft) {
            if (!++in[0]) {
                if (!++in[1]) {
                    if (!++in[2]) {
                        ++in[3];
                    }
                }
            }
            surf();
            outleft = 8;
        }
      }
      surf();
      outleft = 8;
        *x = out[--outleft];
        ++x;
        --xlen;
    }
    *x = out[--outleft];
    ++x;
    --xlen;
  }
 }
--- a/common/sha2.c
+++ b/common/sha2.c
@@ -7,28 +7,28 @@
 typedef unsigned long long uint64;

 static uint64 load_bigendian(const unsigned char *x) {
  return (uint64)(x[7]) | (((uint64)(x[6])) << 8) | (((uint64)(x[5])) << 16) |
         (((uint64)(x[4])) << 24) | (((uint64)(x[3])) << 32) |
         (((uint64)(x[2])) << 40) | (((uint64)(x[1])) << 48) |
         (((uint64)(x[0])) << 56);
    return (uint64)(x[7]) | (((uint64)(x[6])) << 8) | (((uint64)(x[5])) << 16) |
           (((uint64)(x[4])) << 24) | (((uint64)(x[3])) << 32) |
           (((uint64)(x[2])) << 40) | (((uint64)(x[1])) << 48) |
           (((uint64)(x[0])) << 56);
 }

 static void store_bigendian(unsigned char *x, uint64 u) {
  x[7] = u;
  u >>= 8;
  x[6] = u;
  u >>= 8;
  x[5] = u;
  u >>= 8;
  x[4] = u;
  u >>= 8;
  x[3] = u;
  u >>= 8;
  x[2] = u;
  u >>= 8;
  x[1] = u;
  u >>= 8;
  x[0] = u;
    x[7] = u;
    u >>= 8;
    x[6] = u;
    u >>= 8;
    x[5] = u;
    u >>= 8;
    x[4] = u;
    u >>= 8;
    x[3] = u;
    u >>= 8;
    x[2] = u;
    u >>= 8;
    x[1] = u;
    u >>= 8;
    x[0] = u;
 }

 #define SHR(x, c) ((x) >> (c))
@@ -44,210 +44,210 @@ static void store_bigendian(unsigned char *x, uint64 u) {
 #define M(w0, w14, w9, w1) w0 = sigma1(w14) + (w9) + sigma0(w1) + (w0);

 #define EXPAND                                                                 \
  M(w0, w14, w9, w1)                                                           \
  M(w1, w15, w10, w2)                                                          \
  M(w2, w0, w11, w3)                                                           \
  M(w3, w1, w12, w4)                                                           \
  M(w4, w2, w13, w5)                                                           \
  M(w5, w3, w14, w6)                                                           \
  M(w6, w4, w15, w7)                                                           \
  M(w7, w5, w0, w8)                                                            \
  M(w8, w6, w1, w9)                                                            \
  M(w9, w7, w2, w10)                                                           \
  M(w10, w8, w3, w11)                                                          \
  M(w11, w9, w4, w12)                                                          \
  M(w12, w10, w5, w13)                                                         \
  M(w13, w11, w6, w14)                                                         \
  M(w14, w12, w7, w15)                                                         \
  M(w15, w13, w8, w0)
    M(w0, w14, w9, w1)                                                         \
    M(w1, w15, w10, w2)                                                        \
    M(w2, w0, w11, w3)                                                         \
    M(w3, w1, w12, w4)                                                         \
    M(w4, w2, w13, w5)                                                         \
    M(w5, w3, w14, w6)                                                         \
    M(w6, w4, w15, w7)                                                         \
    M(w7, w5, w0, w8)                                                          \
    M(w8, w6, w1, w9)                                                          \
    M(w9, w7, w2, w10)                                                         \
    M(w10, w8, w3, w11)                                                        \
    M(w11, w9, w4, w12)                                                        \
    M(w12, w10, w5, w13)                                                       \
    M(w13, w11, w6, w14)                                                       \
    M(w14, w12, w7, w15)                                                       \
    M(w15, w13, w8, w0)

 #define F(w, k)                                                                \
  T1 = h + Sigma1(e) + Ch(e, f, g) + (k) + (w);                                \
  T2 = Sigma0(a) + Maj(a, b, c);                                               \
  h = g;                                                                       \
  g = f;                                                                       \
  f = e;                                                                       \
  e = d + T1;                                                                  \
  d = c;                                                                       \
  c = b;                                                                       \
  b = a;                                                                       \
  a = T1 + T2;
    T1 = h + Sigma1(e) + Ch(e, f, g) + (k) + (w);                              \
    T2 = Sigma0(a) + Maj(a, b, c);                                             \
    h = g;                                                                     \
    g = f;                                                                     \
    f = e;                                                                     \
    e = d + T1;                                                                \
    d = c;                                                                     \
    c = b;                                                                     \
    b = a;                                                                     \
    a = T1 + T2;

 static int crypto_hashblocks_sha512(unsigned char *statebytes,
                                    const unsigned char *in,
                                    unsigned long long inlen) {
  uint64 state[8];
  uint64 a;
  uint64 b;
  uint64 c;
  uint64 d;
  uint64 e;
  uint64 f;
  uint64 g;
  uint64 h;
  uint64 T1;
  uint64 T2;

  a = load_bigendian(statebytes + 0);
  state[0] = a;
  b = load_bigendian(statebytes + 8);
  state[1] = b;
  c = load_bigendian(statebytes + 16);
  state[2] = c;
  d = load_bigendian(statebytes + 24);
  state[3] = d;
  e = load_bigendian(statebytes + 32);
  state[4] = e;
  f = load_bigendian(statebytes + 40);
  state[5] = f;
  g = load_bigendian(statebytes + 48);
  state[6] = g;
  h = load_bigendian(statebytes + 56);
  state[7] = h;

  while (inlen >= 128) {
    uint64 w0 = load_bigendian(in + 0);
    uint64 w1 = load_bigendian(in + 8);
    uint64 w2 = load_bigendian(in + 16);
    uint64 w3 = load_bigendian(in + 24);
    uint64 w4 = load_bigendian(in + 32);
    uint64 w5 = load_bigendian(in + 40);
    uint64 w6 = load_bigendian(in + 48);
    uint64 w7 = load_bigendian(in + 56);
    uint64 w8 = load_bigendian(in + 64);
    uint64 w9 = load_bigendian(in + 72);
    uint64 w10 = load_bigendian(in + 80);
    uint64 w11 = load_bigendian(in + 88);
    uint64 w12 = load_bigendian(in + 96);
    uint64 w13 = load_bigendian(in + 104);
    uint64 w14 = load_bigendian(in + 112);
    uint64 w15 = load_bigendian(in + 120);

    F(w0, 0x428a2f98d728ae22ULL)
    F(w1, 0x7137449123ef65cdULL)
    F(w2, 0xb5c0fbcfec4d3b2fULL)
    F(w3, 0xe9b5dba58189dbbcULL)
    F(w4, 0x3956c25bf348b538ULL)
    F(w5, 0x59f111f1b605d019ULL)
    F(w6, 0x923f82a4af194f9bULL)
    F(w7, 0xab1c5ed5da6d8118ULL)
    F(w8, 0xd807aa98a3030242ULL)
    F(w9, 0x12835b0145706fbeULL)
    F(w10, 0x243185be4ee4b28cULL)
    F(w11, 0x550c7dc3d5ffb4e2ULL)
    F(w12, 0x72be5d74f27b896fULL)
    F(w13, 0x80deb1fe3b1696b1ULL)
    F(w14, 0x9bdc06a725c71235ULL)
    F(w15, 0xc19bf174cf692694ULL)

    EXPAND

    F(w0, 0xe49b69c19ef14ad2ULL)
    F(w1, 0xefbe4786384f25e3ULL)
    F(w2, 0x0fc19dc68b8cd5b5ULL)
    F(w3, 0x240ca1cc77ac9c65ULL)
    F(w4, 0x2de92c6f592b0275ULL)
    F(w5, 0x4a7484aa6ea6e483ULL)
    F(w6, 0x5cb0a9dcbd41fbd4ULL)
    F(w7, 0x76f988da831153b5ULL)
    F(w8, 0x983e5152ee66dfabULL)
    F(w9, 0xa831c66d2db43210ULL)
    F(w10, 0xb00327c898fb213fULL)
    F(w11, 0xbf597fc7beef0ee4ULL)
    F(w12, 0xc6e00bf33da88fc2ULL)
    F(w13, 0xd5a79147930aa725ULL)
    F(w14, 0x06ca6351e003826fULL)
    F(w15, 0x142929670a0e6e70ULL)

    EXPAND

    F(w0, 0x27b70a8546d22ffcULL)
    F(w1, 0x2e1b21385c26c926ULL)
    F(w2, 0x4d2c6dfc5ac42aedULL)
    F(w3, 0x53380d139d95b3dfULL)
    F(w4, 0x650a73548baf63deULL)
    F(w5, 0x766a0abb3c77b2a8ULL)
    F(w6, 0x81c2c92e47edaee6ULL)
    F(w7, 0x92722c851482353bULL)
    F(w8, 0xa2bfe8a14cf10364ULL)
    F(w9, 0xa81a664bbc423001ULL)
    F(w10, 0xc24b8b70d0f89791ULL)
    F(w11, 0xc76c51a30654be30ULL)
    F(w12, 0xd192e819d6ef5218ULL)
    F(w13, 0xd69906245565a910ULL)
    F(w14, 0xf40e35855771202aULL)
    F(w15, 0x106aa07032bbd1b8ULL)

    EXPAND

    F(w0, 0x19a4c116b8d2d0c8ULL)
    F(w1, 0x1e376c085141ab53ULL)
    F(w2, 0x2748774cdf8eeb99ULL)
    F(w3, 0x34b0bcb5e19b48a8ULL)
    F(w4, 0x391c0cb3c5c95a63ULL)
    F(w5, 0x4ed8aa4ae3418acbULL)
    F(w6, 0x5b9cca4f7763e373ULL)
    F(w7, 0x682e6ff3d6b2b8a3ULL)
    F(w8, 0x748f82ee5defb2fcULL)
    F(w9, 0x78a5636f43172f60ULL)
    F(w10, 0x84c87814a1f0ab72ULL)
    F(w11, 0x8cc702081a6439ecULL)
    F(w12, 0x90befffa23631e28ULL)
    F(w13, 0xa4506cebde82bde9ULL)
    F(w14, 0xbef9a3f7b2c67915ULL)
    F(w15, 0xc67178f2e372532bULL)

    EXPAND

    F(w0, 0xca273eceea26619cULL)
    F(w1, 0xd186b8c721c0c207ULL)
    F(w2, 0xeada7dd6cde0eb1eULL)
    F(w3, 0xf57d4f7fee6ed178ULL)
    F(w4, 0x06f067aa72176fbaULL)
    F(w5, 0x0a637dc5a2c898a6ULL)
    F(w6, 0x113f9804bef90daeULL)
    F(w7, 0x1b710b35131c471bULL)
    F(w8, 0x28db77f523047d84ULL)
    F(w9, 0x32caab7b40c72493ULL)
    F(w10, 0x3c9ebe0a15c9bebcULL)
    F(w11, 0x431d67c49c100d4cULL)
    F(w12, 0x4cc5d4becb3e42b6ULL)
    F(w13, 0x597f299cfc657e2aULL)
    F(w14, 0x5fcb6fab3ad6faecULL)
    F(w15, 0x6c44198c4a475817ULL)

    a += state[0];
    b += state[1];
    c += state[2];
    d += state[3];
    e += state[4];
    f += state[5];
    g += state[6];
    h += state[7];

    uint64 state[8];
    uint64 a;
    uint64 b;
    uint64 c;
    uint64 d;
    uint64 e;
    uint64 f;
    uint64 g;
    uint64 h;
    uint64 T1;
    uint64 T2;

    a = load_bigendian(statebytes + 0);
    state[0] = a;
    b = load_bigendian(statebytes + 8);
    state[1] = b;
    c = load_bigendian(statebytes + 16);
    state[2] = c;
    d = load_bigendian(statebytes + 24);
    state[3] = d;
    e = load_bigendian(statebytes + 32);
    state[4] = e;
    f = load_bigendian(statebytes + 40);
    state[5] = f;
    g = load_bigendian(statebytes + 48);
    state[6] = g;
    h = load_bigendian(statebytes + 56);
    state[7] = h;

    in += 128;
    inlen -= 128;
  }
    while (inlen >= 128) {
        uint64 w0 = load_bigendian(in + 0);
        uint64 w1 = load_bigendian(in + 8);
        uint64 w2 = load_bigendian(in + 16);
        uint64 w3 = load_bigendian(in + 24);
        uint64 w4 = load_bigendian(in + 32);
        uint64 w5 = load_bigendian(in + 40);
        uint64 w6 = load_bigendian(in + 48);
        uint64 w7 = load_bigendian(in + 56);
        uint64 w8 = load_bigendian(in + 64);
        uint64 w9 = load_bigendian(in + 72);
        uint64 w10 = load_bigendian(in + 80);
        uint64 w11 = load_bigendian(in + 88);
        uint64 w12 = load_bigendian(in + 96);
        uint64 w13 = load_bigendian(in + 104);
        uint64 w14 = load_bigendian(in + 112);
        uint64 w15 = load_bigendian(in + 120);

        F(w0, 0x428a2f98d728ae22ULL)
        F(w1, 0x7137449123ef65cdULL)
        F(w2, 0xb5c0fbcfec4d3b2fULL)
        F(w3, 0xe9b5dba58189dbbcULL)
        F(w4, 0x3956c25bf348b538ULL)
        F(w5, 0x59f111f1b605d019ULL)
        F(w6, 0x923f82a4af194f9bULL)
        F(w7, 0xab1c5ed5da6d8118ULL)
        F(w8, 0xd807aa98a3030242ULL)
        F(w9, 0x12835b0145706fbeULL)
        F(w10, 0x243185be4ee4b28cULL)
        F(w11, 0x550c7dc3d5ffb4e2ULL)
        F(w12, 0x72be5d74f27b896fULL)
        F(w13, 0x80deb1fe3b1696b1ULL)
        F(w14, 0x9bdc06a725c71235ULL)
        F(w15, 0xc19bf174cf692694ULL)

        EXPAND

        F(w0, 0xe49b69c19ef14ad2ULL)
        F(w1, 0xefbe4786384f25e3ULL)
        F(w2, 0x0fc19dc68b8cd5b5ULL)
        F(w3, 0x240ca1cc77ac9c65ULL)
        F(w4, 0x2de92c6f592b0275ULL)
        F(w5, 0x4a7484aa6ea6e483ULL)
        F(w6, 0x5cb0a9dcbd41fbd4ULL)
        F(w7, 0x76f988da831153b5ULL)
        F(w8, 0x983e5152ee66dfabULL)
        F(w9, 0xa831c66d2db43210ULL)
        F(w10, 0xb00327c898fb213fULL)
        F(w11, 0xbf597fc7beef0ee4ULL)
        F(w12, 0xc6e00bf33da88fc2ULL)
        F(w13, 0xd5a79147930aa725ULL)
        F(w14, 0x06ca6351e003826fULL)
        F(w15, 0x142929670a0e6e70ULL)

        EXPAND

        F(w0, 0x27b70a8546d22ffcULL)
        F(w1, 0x2e1b21385c26c926ULL)
        F(w2, 0x4d2c6dfc5ac42aedULL)
        F(w3, 0x53380d139d95b3dfULL)
        F(w4, 0x650a73548baf63deULL)
        F(w5, 0x766a0abb3c77b2a8ULL)
        F(w6, 0x81c2c92e47edaee6ULL)
        F(w7, 0x92722c851482353bULL)
        F(w8, 0xa2bfe8a14cf10364ULL)
        F(w9, 0xa81a664bbc423001ULL)
        F(w10, 0xc24b8b70d0f89791ULL)
        F(w11, 0xc76c51a30654be30ULL)
        F(w12, 0xd192e819d6ef5218ULL)
        F(w13, 0xd69906245565a910ULL)
        F(w14, 0xf40e35855771202aULL)
        F(w15, 0x106aa07032bbd1b8ULL)

        EXPAND

        F(w0, 0x19a4c116b8d2d0c8ULL)
        F(w1, 0x1e376c085141ab53ULL)
        F(w2, 0x2748774cdf8eeb99ULL)
        F(w3, 0x34b0bcb5e19b48a8ULL)
        F(w4, 0x391c0cb3c5c95a63ULL)
        F(w5, 0x4ed8aa4ae3418acbULL)
        F(w6, 0x5b9cca4f7763e373ULL)
        F(w7, 0x682e6ff3d6b2b8a3ULL)
        F(w8, 0x748f82ee5defb2fcULL)
        F(w9, 0x78a5636f43172f60ULL)
        F(w10, 0x84c87814a1f0ab72ULL)
        F(w11, 0x8cc702081a6439ecULL)
        F(w12, 0x90befffa23631e28ULL)
        F(w13, 0xa4506cebde82bde9ULL)
        F(w14, 0xbef9a3f7b2c67915ULL)
        F(w15, 0xc67178f2e372532bULL)

        EXPAND

        F(w0, 0xca273eceea26619cULL)
        F(w1, 0xd186b8c721c0c207ULL)
        F(w2, 0xeada7dd6cde0eb1eULL)
        F(w3, 0xf57d4f7fee6ed178ULL)
        F(w4, 0x06f067aa72176fbaULL)
        F(w5, 0x0a637dc5a2c898a6ULL)
        F(w6, 0x113f9804bef90daeULL)
        F(w7, 0x1b710b35131c471bULL)
        F(w8, 0x28db77f523047d84ULL)
        F(w9, 0x32caab7b40c72493ULL)
        F(w10, 0x3c9ebe0a15c9bebcULL)
        F(w11, 0x431d67c49c100d4cULL)
        F(w12, 0x4cc5d4becb3e42b6ULL)
        F(w13, 0x597f299cfc657e2aULL)
        F(w14, 0x5fcb6fab3ad6faecULL)
        F(w15, 0x6c44198c4a475817ULL)

        a += state[0];
        b += state[1];
        c += state[2];
        d += state[3];
        e += state[4];
        f += state[5];
        g += state[6];
        h += state[7];

        state[0] = a;
        state[1] = b;
        state[2] = c;
        state[3] = d;
        state[4] = e;
        state[5] = f;
        state[6] = g;
        state[7] = h;

        in += 128;
        inlen -= 128;
    }

  store_bigendian(statebytes + 0, state[0]);
  store_bigendian(statebytes + 8, state[1]);
  store_bigendian(statebytes + 16, state[2]);
  store_bigendian(statebytes + 24, state[3]);
  store_bigendian(statebytes + 32, state[4]);
  store_bigendian(statebytes + 40, state[5]);
  store_bigendian(statebytes + 48, state[6]);
  store_bigendian(statebytes + 56, state[7]);
    store_bigendian(statebytes + 0, state[0]);
    store_bigendian(statebytes + 8, state[1]);
    store_bigendian(statebytes + 16, state[2]);
    store_bigendian(statebytes + 24, state[3]);
    store_bigendian(statebytes + 32, state[4]);
    store_bigendian(statebytes + 40, state[5]);
    store_bigendian(statebytes + 48, state[6]);
    store_bigendian(statebytes + 56, state[7]);

  return inlen;
    return inlen;
 }

 #define blocks crypto_hashblocks_sha512
@@ -270,116 +270,116 @@ static const unsigned char iv_512[64] = {

 int sha384(unsigned char *out, const unsigned char *in,
           unsigned long long inlen) {
  unsigned char h[64];
  unsigned char padded[256];
  unsigned int i;
  unsigned long long bytes = inlen;

  for (i = 0; i < 64; ++i) {
    h[i] = iv_384[i];
  }

  blocks(h, in, inlen);
  in += inlen;
  inlen &= 127;
  in -= inlen;

  for (i = 0; i < inlen; ++i) {
    padded[i] = in[i];
  }
  padded[inlen] = 0x80;

  if (inlen < 112) {
    for (i = inlen + 1; i < 119; ++i) {
      padded[i] = 0;
    unsigned char h[64];
    unsigned char padded[256];
    unsigned int i;
    unsigned long long bytes = inlen;

    for (i = 0; i < 64; ++i) {
        h[i] = iv_384[i];
    }

    blocks(h, in, inlen);
    in += inlen;
    inlen &= 127;
    in -= inlen;

    for (i = 0; i < inlen; ++i) {
        padded[i] = in[i];
    }
    padded[119] = bytes >> 61;
    padded[120] = bytes >> 53;
    padded[121] = bytes >> 45;
    padded[122] = bytes >> 37;
    padded[123] = bytes >> 29;
    padded[124] = bytes >> 21;
    padded[125] = bytes >> 13;
    padded[126] = bytes >> 5;
    padded[127] = bytes << 3;
    blocks(h, padded, 128);
  } else {
    for (i = inlen + 1; i < 247; ++i) {
      padded[i] = 0;
    padded[inlen] = 0x80;

    if (inlen < 112) {
        for (i = inlen + 1; i < 119; ++i) {
            padded[i] = 0;
        }
        padded[119] = bytes >> 61;
        padded[120] = bytes >> 53;
        padded[121] = bytes >> 45;
        padded[122] = bytes >> 37;
        padded[123] = bytes >> 29;
        padded[124] = bytes >> 21;
        padded[125] = bytes >> 13;
        padded[126] = bytes >> 5;
        padded[127] = bytes << 3;
        blocks(h, padded, 128);
    } else {
        for (i = inlen + 1; i < 247; ++i) {
            padded[i] = 0;
        }
        padded[247] = bytes >> 61;
        padded[248] = bytes >> 53;
        padded[249] = bytes >> 45;
        padded[250] = bytes >> 37;
        padded[251] = bytes >> 29;
        padded[252] = bytes >> 21;
        padded[253] = bytes >> 13;
        padded[254] = bytes >> 5;
        padded[255] = bytes << 3;
        blocks(h, padded, 256);
    }

    for (i = 0; i < 48; ++i) {
        out[i] = h[i];
    }
    padded[247] = bytes >> 61;
    padded[248] = bytes >> 53;
    padded[249] = bytes >> 45;
    padded[250] = bytes >> 37;
    padded[251] = bytes >> 29;
    padded[252] = bytes >> 21;
    padded[253] = bytes >> 13;
    padded[254] = bytes >> 5;
    padded[255] = bytes << 3;
    blocks(h, padded, 256);
  }

  for (i = 0; i < 48; ++i) {
    out[i] = h[i];
  }

  return 0;

    return 0;
 }

 int sha512(unsigned char *out, const unsigned char *in,
           unsigned long long inlen) {
  unsigned char h[64];
  unsigned char padded[256];
  unsigned int i;
  unsigned long long bytes = inlen;

  for (i = 0; i < 64; ++i) {
    h[i] = iv_512[i];
  }

  blocks(h, in, inlen);
  in += inlen;
  inlen &= 127;
  in -= inlen;

  for (i = 0; i < inlen; ++i) {
    padded[i] = in[i];
  }
  padded[inlen] = 0x80;

  if (inlen < 112) {
    for (i = inlen + 1; i < 119; ++i) {
      padded[i] = 0;
    unsigned char h[64];
    unsigned char padded[256];
    unsigned int i;
    unsigned long long bytes = inlen;

    for (i = 0; i < 64; ++i) {
        h[i] = iv_512[i];
    }

    blocks(h, in, inlen);
    in += inlen;
    inlen &= 127;
    in -= inlen;

    for (i = 0; i < inlen; ++i) {
        padded[i] = in[i];
    }
    padded[inlen] = 0x80;

    if (inlen < 112) {
        for (i = inlen + 1; i < 119; ++i) {
            padded[i] = 0;
        }
        padded[119] = bytes >> 61;
        padded[120] = bytes >> 53;
        padded[121] = bytes >> 45;
        padded[122] = bytes >> 37;
        padded[123] = bytes >> 29;
        padded[124] = bytes >> 21;
        padded[125] = bytes >> 13;
        padded[126] = bytes >> 5;
        padded[127] = bytes << 3;
        blocks(h, padded, 128);
    } else {
        for (i = inlen + 1; i < 247; ++i) {
            padded[i] = 0;
        }
        padded[247] = bytes >> 61;
        padded[248] = bytes >> 53;
        padded[249] = bytes >> 45;
        padded[250] = bytes >> 37;
        padded[251] = bytes >> 29;
        padded[252] = bytes >> 21;
        padded[253] = bytes >> 13;
        padded[254] = bytes >> 5;
        padded[255] = bytes << 3;
        blocks(h, padded, 256);
    }
    padded[119] = bytes >> 61;
    padded[120] = bytes >> 53;
    padded[121] = bytes >> 45;
    padded[122] = bytes >> 37;
    padded[123] = bytes >> 29;
    padded[124] = bytes >> 21;
    padded[125] = bytes >> 13;
    padded[126] = bytes >> 5;
    padded[127] = bytes << 3;
    blocks(h, padded, 128);
  } else {
    for (i = inlen + 1; i < 247; ++i) {
      padded[i] = 0;

    for (i = 0; i < 64; ++i) {
        out[i] = h[i];
    }
    padded[247] = bytes >> 61;
    padded[248] = bytes >> 53;
    padded[249] = bytes >> 45;
    padded[250] = bytes >> 37;
    padded[251] = bytes >> 29;
    padded[252] = bytes >> 21;
    padded[253] = bytes >> 13;
    padded[254] = bytes >> 5;
    padded[255] = bytes << 3;
    blocks(h, padded, 256);
  }

  for (i = 0; i < 64; ++i) {
    out[i] = h[i];
  }

  return 0;

    return 0;
 }
--- a/crypto_kem/kyber768/clean/cbd.c
+++ b/crypto_kem/kyber768/clean/cbd.c
@@ -13,12 +13,12 @@
 * Returns 64-bit unsigned integer loaded from x
 **************************************************/
 static uint64_t load_littleendian(const unsigned char *x, int bytes) {
  int i;
  uint64_t r = x[0];
  for (i = 1; i < bytes; i++) {
    r |= (uint64_t)x[i] << (8 * i);
  }
  return r;
    int i;
    uint64_t r = x[0];
    for (i = 1; i < bytes; i++) {
        r |= (uint64_t)x[i] << (8 * i);
    }
    return r;
 }

 /*************************************************
@@ -33,78 +33,78 @@ static uint64_t load_littleendian(const unsigned char *x, int bytes) {
 **************************************************/
 void cbd(poly *r, const unsigned char *buf) {
 #if KYBER_ETA == 3
  uint32_t t, d, a[4], b[4];
  int i, j;
    uint32_t t, d, a[4], b[4];
    int i, j;

  for (i = 0; i < KYBER_N / 4; i++) {
    t = load_littleendian(buf + 3 * i, 3);
    d = 0;
    for (j = 0; j < 3; j++)
      d += (t >> j) & 0x249249;
    for (i = 0; i < KYBER_N / 4; i++) {
        t = load_littleendian(buf + 3 * i, 3);
        d = 0;
        for (j = 0; j < 3; j++)
            d += (t >> j) & 0x249249;

    a[0] = d & 0x7;
    b[0] = (d >> 3) & 0x7;
    a[1] = (d >> 6) & 0x7;
    b[1] = (d >> 9) & 0x7;
    a[2] = (d >> 12) & 0x7;
    b[2] = (d >> 15) & 0x7;
    a[3] = (d >> 18) & 0x7;
    b[3] = (d >> 21);
        a[0] = d & 0x7;
        b[0] = (d >> 3) & 0x7;
        a[1] = (d >> 6) & 0x7;
        b[1] = (d >> 9) & 0x7;
        a[2] = (d >> 12) & 0x7;
        b[2] = (d >> 15) & 0x7;
        a[3] = (d >> 18) & 0x7;
        b[3] = (d >> 21);

    r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
    r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
    r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
    r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
  }
        r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
        r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
        r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
        r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
    }
 #elif KYBER_ETA == 4
  uint32_t t, d, a[4], b[4];
  int i, j;
    uint32_t t, d, a[4], b[4];
    int i, j;

  for (i = 0; i < KYBER_N / 4; i++) {
    t = load_littleendian(buf + 4 * i, 4);
    d = 0;
    for (j = 0; j < 4; j++) {
      d += (t >> j) & 0x11111111;
    }
    for (i = 0; i < KYBER_N / 4; i++) {
        t = load_littleendian(buf + 4 * i, 4);
        d = 0;
        for (j = 0; j < 4; j++) {
            d += (t >> j) & 0x11111111;
        }

    a[0] = d & 0xf;
    b[0] = (d >> 4) & 0xf;
    a[1] = (d >> 8) & 0xf;
    b[1] = (d >> 12) & 0xf;
    a[2] = (d >> 16) & 0xf;
    b[2] = (d >> 20) & 0xf;
    a[3] = (d >> 24) & 0xf;
    b[3] = (d >> 28);
        a[0] = d & 0xf;
        b[0] = (d >> 4) & 0xf;
        a[1] = (d >> 8) & 0xf;
        b[1] = (d >> 12) & 0xf;
        a[2] = (d >> 16) & 0xf;
        b[2] = (d >> 20) & 0xf;
        a[3] = (d >> 24) & 0xf;
        b[3] = (d >> 28);

    r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
    r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
    r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
    r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
  }
        r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
        r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
        r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
        r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
    }
 #elif KYBER_ETA == 5
  uint64_t t, d, a[4], b[4];
  int i, j;
    uint64_t t, d, a[4], b[4];
    int i, j;

  for (i = 0; i < KYBER_N / 4; i++) {
    t = load_littleendian(buf + 5 * i, 5);
    d = 0;
    for (j = 0; j < 5; j++)
      d += (t >> j) & 0x0842108421UL;
    for (i = 0; i < KYBER_N / 4; i++) {
        t = load_littleendian(buf + 5 * i, 5);
        d = 0;
        for (j = 0; j < 5; j++)
            d += (t >> j) & 0x0842108421UL;

    a[0] = d & 0x1f;
    b[0] = (d >> 5) & 0x1f;
    a[1] = (d >> 10) & 0x1f;
    b[1] = (d >> 15) & 0x1f;
    a[2] = (d >> 20) & 0x1f;
    b[2] = (d >> 25) & 0x1f;
    a[3] = (d >> 30) & 0x1f;
    b[3] = (d >> 35);
        a[0] = d & 0x1f;
        b[0] = (d >> 5) & 0x1f;
        a[1] = (d >> 10) & 0x1f;
        b[1] = (d >> 15) & 0x1f;
        a[2] = (d >> 20) & 0x1f;
        b[2] = (d >> 25) & 0x1f;
        a[3] = (d >> 30) & 0x1f;
        b[3] = (d >> 35);

    r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
    r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
    r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
    r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
  }
        r->coeffs[4 * i + 0] = a[0] + KYBER_Q - b[0];
        r->coeffs[4 * i + 1] = a[1] + KYBER_Q - b[1];
        r->coeffs[4 * i + 2] = a[2] + KYBER_Q - b[2];
        r->coeffs[4 * i + 3] = a[3] + KYBER_Q - b[3];
    }
 #else
 #error "poly_getnoise in poly.c only supports eta in {3,4,5}"
 #endif
--- a/crypto_kem/kyber768/clean/indcpa.c
+++ b/crypto_kem/kyber768/clean/indcpa.c
@@ -19,11 +19,11 @@
 **************************************************/
 static void pack_pk(unsigned char *r, const polyvec *pk,
                    const unsigned char *seed) {
  int i;
  polyvec_compress(r, pk);
  for (i = 0; i < KYBER_SYMBYTES; i++) {
    r[i + KYBER_POLYVECCOMPRESSEDBYTES] = seed[i];
  }
    int i;
    polyvec_compress(r, pk);
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        r[i + KYBER_POLYVECCOMPRESSEDBYTES] = seed[i];
    }
 }

 /*************************************************
@@ -41,12 +41,12 @@ static void pack_pk(unsigned char *r, const polyvec *pk,
 **************************************************/
 static void unpack_pk(polyvec *pk, unsigned char *seed,
                      const unsigned char *packedpk) {
  int i;
  polyvec_decompress(pk, packedpk);
    int i;
    polyvec_decompress(pk, packedpk);

  for (i = 0; i < KYBER_SYMBYTES; i++) {
    seed[i] = packedpk[i + KYBER_POLYVECCOMPRESSEDBYTES];
  }
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        seed[i] = packedpk[i + KYBER_POLYVECCOMPRESSEDBYTES];
    }
 }

 /*************************************************
@@ -61,8 +61,8 @@ static void unpack_pk(polyvec *pk, unsigned char *seed,
 *polynomials b const unsigned char *seed: pointer to the input polynomial v
 **************************************************/
 static void pack_ciphertext(unsigned char *r, const polyvec *b, const poly *v) {
  polyvec_compress(r, b);
  poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
    polyvec_compress(r, b);
    poly_compress(r + KYBER_POLYVECCOMPRESSEDBYTES, v);
 }

 /*************************************************
@@ -78,8 +78,8 @@ static void pack_ciphertext(unsigned char *r, const polyvec *b, const poly *v) {
 *ciphertext
 **************************************************/
 static void unpack_ciphertext(polyvec *b, poly *v, const unsigned char *c) {
  polyvec_decompress(b, c);
  poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
    polyvec_decompress(b, c);
    poly_decompress(v, c + KYBER_POLYVECCOMPRESSEDBYTES);
 }

 /*************************************************
@@ -92,7 +92,7 @@ static void unpack_ciphertext(polyvec *b, poly *v, const unsigned char *c) {
 *(secret key)
 **************************************************/
 static void pack_sk(unsigned char *r, const polyvec *sk) {
  polyvec_tobytes(r, sk);
    polyvec_tobytes(r, sk);
 }

 /*************************************************
@@ -107,7 +107,7 @@ static void pack_sk(unsigned char *r, const polyvec *sk) {
 *secret key
 **************************************************/
 static void unpack_sk(polyvec *sk, const unsigned char *packedsk) {
  polyvec_frombytes(sk, packedsk);
    polyvec_frombytes(sk, packedsk);
 }

 #define gen_a(A, B) gen_matrix(A, B, 0)
@@ -129,49 +129,49 @@ static void unpack_sk(polyvec *sk, const unsigned char *packedsk) {
 void gen_matrix(polyvec *a, const unsigned char *seed,
                int transposed) // Not static for benchmarking
 {
  unsigned int pos = 0, ctr;
  uint16_t val;
  unsigned int nblocks;
  const unsigned int maxnblocks = 4;
  uint8_t buf[SHAKE128_RATE * maxnblocks];
  int i, j;
  uint64_t state[25]; // SHAKE state
  unsigned char extseed[KYBER_SYMBYTES + 2];

  for (i = 0; i < KYBER_SYMBYTES; i++) {
    extseed[i] = seed[i];
  }

  for (i = 0; i < KYBER_K; i++) {
    for (j = 0; j < KYBER_K; j++) {
      ctr = pos = 0;
      nblocks = maxnblocks;
      if (transposed) {
        extseed[KYBER_SYMBYTES] = i;
        extseed[KYBER_SYMBYTES + 1] = j;
      } else {
        extseed[KYBER_SYMBYTES] = j;
        extseed[KYBER_SYMBYTES + 1] = i;
      }

      shake128_absorb(state, extseed, KYBER_SYMBYTES + 2);
      shake128_squeezeblocks(buf, nblocks, state);

      while (ctr < KYBER_N) {
        val = (buf[pos] | ((uint16_t)buf[pos + 1] << 8)) & 0x1fff;
        if (val < KYBER_Q) {
          a[i].vec[j].coeffs[ctr++] = val;
        }
        pos += 2;
    unsigned int pos = 0, ctr;
    uint16_t val;
    unsigned int nblocks;
    const unsigned int maxnblocks = 4;
    uint8_t buf[SHAKE128_RATE * maxnblocks];
    int i, j;
    uint64_t state[25]; // SHAKE state
    unsigned char extseed[KYBER_SYMBYTES + 2];

    for (i = 0; i < KYBER_SYMBYTES; i++) {
        extseed[i] = seed[i];
    }

        if (pos > SHAKE128_RATE * nblocks - 2) {
          nblocks = 1;
          shake128_squeezeblocks(buf, nblocks, state);
          pos = 0;
    for (i = 0; i < KYBER_K; i++) {
        for (j = 0; j < KYBER_K; j++) {
            ctr = pos = 0;
            nblocks = maxnblocks;
            if (transposed) {
                extseed[KYBER_SYMBYTES] = i;
                extseed[KYBER_SYMBYTES + 1] = j;
            } else {
                extseed[KYBER_SYMBYTES] = j;
                extseed[KYBER_SYMBYTES + 1] = i;
            }

            shake128_absorb(state, extseed, KYBER_SYMBYTES + 2);
            shake128_squeezeblocks(buf, nblocks, state);

            while (ctr < KYBER_N) {
                val = (buf[pos] | ((uint16_t)buf[pos + 1] << 8)) & 0x1fff;
                if (val < KYBER_Q) {
                    a[i].vec[j].coeffs[ctr++] = val;
                }
                pos += 2;

                if (pos > SHAKE128_RATE * nblocks - 2) {
                    nblocks = 1;
                    shake128_squeezeblocks(buf, nblocks, state);
                    pos = 0;
                }
            }
        }
      }
    }
  }
 }

 /*************************************************
@@ -186,38 +186,38 @@ void gen_matrix(polyvec *a, const unsigned char *seed,
 *KYBER_INDCPA_SECRETKEYBYTES bytes)
 **************************************************/
 void indcpa_keypair(unsigned char *pk, unsigned char *sk) {
  polyvec a[KYBER_K], e, pkpv, skpv;
  unsigned char buf[KYBER_SYMBYTES + KYBER_SYMBYTES];
  unsigned char *publicseed = buf;
  unsigned char *noiseseed = buf + KYBER_SYMBYTES;
  int i;
  unsigned char nonce = 0;
    polyvec a[KYBER_K], e, pkpv, skpv;
    unsigned char buf[KYBER_SYMBYTES + KYBER_SYMBYTES];
    unsigned char *publicseed = buf;
    unsigned char *noiseseed = buf + KYBER_SYMBYTES;
    int i;
    unsigned char nonce = 0;

  randombytes(buf, KYBER_SYMBYTES);
  sha3_512(buf, buf, KYBER_SYMBYTES);
    randombytes(buf, KYBER_SYMBYTES);
    sha3_512(buf, buf, KYBER_SYMBYTES);

  gen_a(a, publicseed);
    gen_a(a, publicseed);

  for (i = 0; i < KYBER_K; i++) {
    poly_getnoise(skpv.vec + i, noiseseed, nonce++);
  }
    for (i = 0; i < KYBER_K; i++) {
        poly_getnoise(skpv.vec + i, noiseseed, nonce++);
    }

  polyvec_ntt(&skpv);
    polyvec_ntt(&skpv);

  for (i = 0; i < KYBER_K; i++) {
    poly_getnoise(e.vec + i, noiseseed, nonce++);
  }
    for (i = 0; i < KYBER_K; i++) {
        poly_getnoise(e.vec + i, noiseseed, nonce++);
    }

  // matrix-vector multiplication
  for (i = 0; i < KYBER_K; i++) {
    polyvec_pointwise_acc(&pkpv.vec[i], &skpv, a + i);
  }
    // matrix-vector multiplication
    for (i = 0; i < KYBER_K; i++) {
        polyvec_pointwise_acc(&pkpv.vec[i], &skpv, a + i);
    }

  polyvec_invntt(&pkpv);
  polyvec_add(&pkpv, &pkpv, &e);
    polyvec_invntt(&pkpv);
    polyvec_add(&pkpv, &pkpv, &e);

  pack_sk(sk, &skpv);
  pack_pk(pk, &pkpv, publicseed);
    pack_sk(sk, &skpv);
    pack_pk(pk, &pkpv, publicseed);
 }

 /*************************************************
@@ -238,47 +238,47 @@ void indcpa_keypair(unsigned char *pk, unsigned char *sk) {
 **************************************************/
 void indcpa_enc(unsigned char *c, const unsigned char *m,
                const unsigned char *pk, const unsigned char *coins) {
  polyvec sp, pkpv, ep, at[KYBER_K], bp;
  poly v, k, epp;
  unsigned char seed[KYBER_SYMBYTES];
  int i;
  unsigned char nonce = 0;
    polyvec sp, pkpv, ep, at[KYBER_K], bp;
    poly v, k, epp;
    unsigned char seed[KYBER_SYMBYTES];
    int i;
    unsigned char nonce = 0;

  unpack_pk(&pkpv, seed, pk);
    unpack_pk(&pkpv, seed, pk);

  poly_frommsg(&k, m);
    poly_frommsg(&k, m);

  polyvec_ntt(&pkpv);
    polyvec_ntt(&pkpv);

  gen_at(at, seed);
    gen_at(at, seed);

  for (i = 0; i < KYBER_K; i++) {
    poly_getnoise(sp.vec + i, coins, nonce++);
  }
    for (i = 0; i < KYBER_K; i++) {
        poly_getnoise(sp.vec + i, coins, nonce++);
    }

  polyvec_ntt(&sp);
    polyvec_ntt(&sp);

  for (i = 0; i < KYBER_K; i++) {
    poly_getnoise(ep.vec + i, coins, nonce++);
  }
    for (i = 0; i < KYBER_K; i++) {
        poly_getnoise(ep.vec + i, coins, nonce++);
    }

  // matrix-vector multiplication
  for (i = 0; i < KYBER_K; i++) {
    polyvec_pointwise_acc(&bp.vec[i], &sp, at + i);
  }
    // matrix-vector multiplication
    for (i = 0; i < KYBER_K; i++) {
        polyvec_pointwise_acc(&bp.vec[i], &sp, at + i);
    }

  polyvec_invntt(&bp);
  polyvec_add(&bp, &bp, &ep);
    polyvec_invntt(&bp);
    polyvec_add(&bp, &bp, &ep);

  polyvec_pointwise_acc(&v, &pkpv, &sp);
  poly_invntt(&v);
    polyvec_pointwise_acc(&v, &pkpv, &sp);
    poly_invntt(&v);

  poly_getnoise(&epp, coins, nonce++);
    poly_getnoise(&epp, coins, nonce++);

  poly_add(&v, &v, &epp);
  poly_add(&v, &v, &k);
    poly_add(&v, &v, &epp);
    poly_add(&v, &v, &k);

  pack_ciphertext(c, &bp, &v);
    pack_ciphertext(c, &bp, &v);
 }

 /*************************************************
@@ -296,18 +296,18 @@ void indcpa_enc(unsigned char *c, const unsigned char *m,
 **************************************************/
 void indcpa_dec(unsigned char *m, const unsigned char *c,
                const unsigned char *sk) {
  polyvec bp, skpv;
  poly v, mp;
    polyvec bp, skpv;
    poly v, mp;

  unpack_ciphertext(&bp, &v, c);
  unpack_sk(&skpv, sk);
    unpack_ciphertext(&bp, &v, c);
    unpack_sk(&skpv, sk);

  polyvec_ntt(&bp);
    polyvec_ntt(&bp);

  polyvec_pointwise_acc(&mp, &skpv, &bp);
  poly_invntt(&mp);
    polyvec_pointwise_acc(&mp, &skpv, &bp);
    poly_invntt(&mp);

  poly_sub(&mp, &mp, &v);
    poly_sub(&mp, &mp, &v);

  poly_tomsg(m, &mp);
    poly_tomsg(m, &mp);
 }
--- a/crypto_kem/kyber768/clean/kem.c
+++ b/crypto_kem/kyber768/clean/kem.c
@@ -1,122 +1,126 @@
 #include "api.h"
 #include "fips202.h"
 #include "indcpa.h"
 #include "params.h"
 #include "randombytes.h"
 #include "verify.h"

 /*************************************************
 * Name:        crypto_kem_keypair
 *
 * Description: Generates public and private key for the CCA-secure Kyber
 *              key encapsulation mechanism. The private key is written as
 *              IND-CPA secret key || public key || sha3_256(pk) || z.
 *
 * Arguments:   - unsigned char *pk: pointer to output public key (an
 *                already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
 *              - unsigned char *sk: pointer to output private key (an
 *                already allocated array of CRYPTO_SECRETKEYBYTES bytes)
 *
 * Returns 0 (success)
 **************************************************/
 int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
  unsigned char *pk_copy = sk + KYBER_INDCPA_SECRETKEYBYTES;
  unsigned char *pk_hash = sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES;
  unsigned char *z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;
  size_t n;

  indcpa_keypair(pk, sk);

  /* Append a copy of the public key after the IND-CPA secret key. */
  for (n = 0; n < KYBER_INDCPA_PUBLICKEYBYTES; ++n) {
    pk_copy[n] = pk[n];
  }

  sha3_256(pk_hash, pk, KYBER_PUBLICKEYBYTES);

  /* Value z for pseudo-random output on reject */
  randombytes(z, KYBER_SYMBYTES);

  return 0;
 }

 /*************************************************
 * Name:        crypto_kem_enc
 *
 * Description: Generates cipher text and shared secret for given
 *              public key
 *
 * Arguments:   - unsigned char *ct:       pointer to output cipher text
 *                (an already allocated array of CRYPTO_CIPHERTEXTBYTES
 *                bytes)
 *              - unsigned char *ss:       pointer to output shared secret
 *                (an already allocated array of CRYPTO_BYTES bytes)
 *              - const unsigned char *pk: pointer to input public key
 *                (an already allocated array of CRYPTO_PUBLICKEYBYTES
 *                bytes)
 *
 * Returns 0 (success)
 **************************************************/
 int crypto_kem_enc(unsigned char *ct, unsigned char *ss,
                   const unsigned char *pk) {
  unsigned char kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
  unsigned char buf[2 * KYBER_SYMBYTES];
  unsigned char *const pk_hash = buf + KYBER_SYMBYTES;
  unsigned char *const coins = kr + KYBER_SYMBYTES;

  randombytes(buf, KYBER_SYMBYTES);
  /* Don't release system RNG output */
  sha3_256(buf, buf, KYBER_SYMBYTES);

  /* Multitarget countermeasure for coins + contributory KEM */
  sha3_256(pk_hash, pk, KYBER_PUBLICKEYBYTES);
  sha3_512(kr, buf, 2 * KYBER_SYMBYTES);

  /* coins are in kr+KYBER_SYMBYTES */
  indcpa_enc(ct, buf, pk, coins);

  /* overwrite coins in kr with H(c) */
  sha3_256(coins, ct, KYBER_CIPHERTEXTBYTES);

  /* hash concatenation of pre-k and H(c) to k */
  sha3_256(ss, kr, 2 * KYBER_SYMBYTES);

  return 0;
 }

 /*************************************************
 * Name:        crypto_kem_dec
 *
 * Description: Generates shared secret for given cipher text and
 *              private key
 *
 * Arguments:   - unsigned char *ss:       pointer to output shared secret
 *                (an already allocated array of CRYPTO_BYTES bytes)
 *              - const unsigned char *ct: pointer to input cipher text
 *                (an already allocated array of CRYPTO_CIPHERTEXTBYTES
 *                bytes)
 *              - const unsigned char *sk: pointer to input private key
 *                (an already allocated array of CRYPTO_SECRETKEYBYTES
 *                bytes)
 *
 * Returns 0.
 *
 * On failure, ss will contain a pseudo-random value.
 **************************************************/
 int crypto_kem_dec(unsigned char *ss, const unsigned char *ct,
                   const unsigned char *sk) {
  unsigned char cmp[KYBER_CIPHERTEXTBYTES];
  unsigned char buf[2 * KYBER_SYMBYTES];
  unsigned char kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins, qrom-hash */
  const unsigned char *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;
  const unsigned char *pk_hash =
      sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES;
  const unsigned char *z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;
  unsigned char *const coins = kr + KYBER_SYMBYTES;
  size_t n;
  int mismatch;

  indcpa_dec(buf, ct, sk);

  /* Multitarget countermeasure for coins + contributory KEM.
     Save hash by storing H(pk) in sk */
  for (n = 0; n < KYBER_SYMBYTES; ++n) {
    buf[KYBER_SYMBYTES + n] = pk_hash[n];
  }
  sha3_512(kr, buf, 2 * KYBER_SYMBYTES);

  /* coins are in kr+KYBER_SYMBYTES */
  indcpa_enc(cmp, buf, pk, coins);

  mismatch = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);

  /* overwrite coins in kr with H(c) */
  sha3_256(coins, ct, KYBER_CIPHERTEXTBYTES);

  /* Overwrite pre-k with z on re-encryption failure */
  cmov(kr, z, KYBER_SYMBYTES, mismatch);

  /* hash concatenation of pre-k and H(c) to k */
  sha3_256(ss, kr, 2 * KYBER_SYMBYTES);

  return 0;
 }
 #include "api.h"
 #include "fips202.h"
 #include "indcpa.h"
 #include "params.h"
 #include "randombytes.h"
 #include "verify.h"

 /*************************************************
 * Name:        crypto_kem_keypair
 *
 * Description: Generates public and private key for the CCA-secure Kyber
 *              key encapsulation mechanism. The private key is written as
 *              IND-CPA secret key || public key || sha3_256(pk) || z.
 *
 * Arguments:   - unsigned char *pk: pointer to output public key (an
 *                already allocated array of CRYPTO_PUBLICKEYBYTES bytes)
 *              - unsigned char *sk: pointer to output private key (an
 *                already allocated array of CRYPTO_SECRETKEYBYTES bytes)
 *
 * Returns 0 (success)
 **************************************************/
 int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) {
    unsigned char *pk_copy = sk + KYBER_INDCPA_SECRETKEYBYTES;
    unsigned char *pk_hash = sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES;
    unsigned char *z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;
    size_t n;

    indcpa_keypair(pk, sk);

    /* Append a copy of the public key after the IND-CPA secret key. */
    for (n = 0; n < KYBER_INDCPA_PUBLICKEYBYTES; ++n) {
        pk_copy[n] = pk[n];
    }

    sha3_256(pk_hash, pk, KYBER_PUBLICKEYBYTES);

    /* Value z for pseudo-random output on reject */
    randombytes(z, KYBER_SYMBYTES);

    return 0;
 }

 /*************************************************
 * Name:        crypto_kem_enc
 *
 * Description: Generates cipher text and shared secret for given
 *              public key
 *
 * Arguments:   - unsigned char *ct:       pointer to output cipher text
 *                (an already allocated array of CRYPTO_CIPHERTEXTBYTES
 *                bytes)
 *              - unsigned char *ss:       pointer to output shared secret
 *                (an already allocated array of CRYPTO_BYTES bytes)
 *              - const unsigned char *pk: pointer to input public key
 *                (an already allocated array of CRYPTO_PUBLICKEYBYTES
 *                bytes)
 *
 * Returns 0 (success)
 **************************************************/
 int crypto_kem_enc(unsigned char *ct, unsigned char *ss,
                   const unsigned char *pk) {
    unsigned char kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins */
    unsigned char buf[2 * KYBER_SYMBYTES];
    unsigned char *const pk_hash = buf + KYBER_SYMBYTES;
    unsigned char *const coins = kr + KYBER_SYMBYTES;

    randombytes(buf, KYBER_SYMBYTES);
    /* Don't release system RNG output */
    sha3_256(buf, buf, KYBER_SYMBYTES);

    /* Multitarget countermeasure for coins + contributory KEM */
    sha3_256(pk_hash, pk, KYBER_PUBLICKEYBYTES);
    sha3_512(kr, buf, 2 * KYBER_SYMBYTES);

    /* coins are in kr+KYBER_SYMBYTES */
    indcpa_enc(ct, buf, pk, coins);

    /* overwrite coins in kr with H(c) */
    sha3_256(coins, ct, KYBER_CIPHERTEXTBYTES);

    /* hash concatenation of pre-k and H(c) to k */
    sha3_256(ss, kr, 2 * KYBER_SYMBYTES);

    return 0;
 }

 /*************************************************
 * Name:        crypto_kem_dec
 *
 * Description: Generates shared secret for given cipher text and
 *              private key
 *
 * Arguments:   - unsigned char *ss:       pointer to output shared secret
 *                (an already allocated array of CRYPTO_BYTES bytes)
 *              - const unsigned char *ct: pointer to input cipher text
 *                (an already allocated array of CRYPTO_CIPHERTEXTBYTES
 *                bytes)
 *              - const unsigned char *sk: pointer to input private key
 *                (an already allocated array of CRYPTO_SECRETKEYBYTES
 *                bytes)
 *
 * Returns 0.
 *
 * On failure, ss will contain a pseudo-random value.
 **************************************************/
 int crypto_kem_dec(unsigned char *ss, const unsigned char *ct,
                   const unsigned char *sk) {
    unsigned char cmp[KYBER_CIPHERTEXTBYTES];
    unsigned char buf[2 * KYBER_SYMBYTES];
    unsigned char
        kr[2 * KYBER_SYMBYTES]; /* Will contain key, coins, qrom-hash */
    const unsigned char *pk = sk + KYBER_INDCPA_SECRETKEYBYTES;
    const unsigned char *pk_hash =
        sk + KYBER_SECRETKEYBYTES - 2 * KYBER_SYMBYTES;
    const unsigned char *z = sk + KYBER_SECRETKEYBYTES - KYBER_SYMBYTES;
    unsigned char *const coins = kr + KYBER_SYMBYTES;
    size_t n;
    int mismatch;

    indcpa_dec(buf, ct, sk);

    /* Multitarget countermeasure for coins + contributory KEM.
       Save hash by storing H(pk) in sk */
    for (n = 0; n < KYBER_SYMBYTES; ++n) {
        buf[KYBER_SYMBYTES + n] = pk_hash[n];
    }
    sha3_512(kr, buf, 2 * KYBER_SYMBYTES);

    /* coins are in kr+KYBER_SYMBYTES */
    indcpa_enc(cmp, buf, pk, coins);

    mismatch = verify(ct, cmp, KYBER_CIPHERTEXTBYTES);

    /* overwrite coins in kr with H(c) */
    sha3_256(coins, ct, KYBER_CIPHERTEXTBYTES);

    /* Overwrite pre-k with z on re-encryption failure */
    cmov(kr, z, KYBER_SYMBYTES, mismatch);

    /* hash concatenation of pre-k and H(c) to k */
    sha3_256(ss, kr, 2 * KYBER_SYMBYTES);

    return 0;
 }
--- a/crypto_kem/kyber768/clean/kex.c
+++ b/crypto_kem/kyber768/clean/kex.c
@@ -3,49 +3,49 @@
 #include "verify.h"

 void kyber_uake_initA(u8 *send, u8 *tk, u8 *sk, const u8 *pkb) {
  crypto_kem_keypair(send, sk);
  crypto_kem_enc(send + KYBER_PUBLICKEYBYTES, tk, pkb);
    crypto_kem_keypair(send, sk);
    crypto_kem_enc(send + KYBER_PUBLICKEYBYTES, tk, pkb);
 }

 void kyber_uake_sharedB(u8 *send, u8 *k, const u8 *recv, const u8 *skb) {
  unsigned char buf[2 * KYBER_SYMBYTES];
  crypto_kem_enc(send, buf, recv);
  crypto_kem_dec(buf + KYBER_SYMBYTES, recv + KYBER_PUBLICKEYBYTES, skb);
  shake256(k, KYBER_SYMBYTES, buf, 2 * KYBER_SYMBYTES);
    unsigned char buf[2 * KYBER_SYMBYTES];
    crypto_kem_enc(send, buf, recv);
    crypto_kem_dec(buf + KYBER_SYMBYTES, recv + KYBER_PUBLICKEYBYTES, skb);
    shake256(k, KYBER_SYMBYTES, buf, 2 * KYBER_SYMBYTES);
 }

 void kyber_uake_sharedA(u8 *k, const u8 *recv, const u8 *tk, const u8 *sk) {
  unsigned char buf[2 * KYBER_SYMBYTES];
  int i;
  crypto_kem_dec(buf, recv, sk);
  for (i = 0; i < KYBER_SYMBYTES; i++) {
    buf[i + KYBER_SYMBYTES] = tk[i];
  }
  shake256(k, KYBER_SYMBYTES, buf, 2 * KYBER_SYMBYTES);
    unsigned char buf[2 * KYBER_SYMBYTES];
    int i;
    crypto_kem_dec(buf, recv, sk);
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        buf[i + KYBER_SYMBYTES] = tk[i];
    }
    shake256(k, KYBER_SYMBYTES, buf, 2 * KYBER_SYMBYTES);
 }

 void kyber_ake_initA(u8 *send, u8 *tk, u8 *sk, const u8 *pkb) {
  crypto_kem_keypair(send, sk);
  crypto_kem_enc(send + KYBER_PUBLICKEYBYTES, tk, pkb);
    crypto_kem_keypair(send, sk);
    crypto_kem_enc(send + KYBER_PUBLICKEYBYTES, tk, pkb);
 }

 void kyber_ake_sharedB(u8 *send, u8 *k, const u8 *recv, const u8 *skb,
                       const u8 *pka) {
  unsigned char buf[3 * KYBER_SYMBYTES];
  crypto_kem_enc(send, buf, recv);
  crypto_kem_enc(send + KYBER_CIPHERTEXTBYTES, buf + KYBER_SYMBYTES, pka);
  crypto_kem_dec(buf + 2 * KYBER_SYMBYTES, recv + KYBER_PUBLICKEYBYTES, skb);
  shake256(k, KYBER_SYMBYTES, buf, 3 * KYBER_SYMBYTES);
    unsigned char buf[3 * KYBER_SYMBYTES];
    crypto_kem_enc(send, buf, recv);
    crypto_kem_enc(send + KYBER_CIPHERTEXTBYTES, buf + KYBER_SYMBYTES, pka);
    crypto_kem_dec(buf + 2 * KYBER_SYMBYTES, recv + KYBER_PUBLICKEYBYTES, skb);
    shake256(k, KYBER_SYMBYTES, buf, 3 * KYBER_SYMBYTES);
 }

 void kyber_ake_sharedA(u8 *k, const u8 *recv, const u8 *tk, const u8 *sk,
                       const u8 *ska) {
  unsigned char buf[3 * KYBER_SYMBYTES];
  int i;
  crypto_kem_dec(buf, recv, sk);
  crypto_kem_dec(buf + KYBER_SYMBYTES, recv + KYBER_CIPHERTEXTBYTES, ska);
  for (i = 0; i < KYBER_SYMBYTES; i++) {
    buf[i + 2 * KYBER_SYMBYTES] = tk[i];
  }
  shake256(k, KYBER_SYMBYTES, buf, 3 * KYBER_SYMBYTES);
    unsigned char buf[3 * KYBER_SYMBYTES];
    int i;
    crypto_kem_dec(buf, recv, sk);
    crypto_kem_dec(buf + KYBER_SYMBYTES, recv + KYBER_CIPHERTEXTBYTES, ska);
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        buf[i + 2 * KYBER_SYMBYTES] = tk[i];
    }
    shake256(k, KYBER_SYMBYTES, buf, 3 * KYBER_SYMBYTES);
 }
--- a/crypto_kem/kyber768/clean/ntt.c
+++ b/crypto_kem/kyber768/clean/ntt.c
@@ -1,80 +1,81 @@
 #include "ntt.h"
 #include "inttypes.h"
 #include "params.h"
 #include "reduce.h"

 extern const uint16_t omegas_inv_bitrev_montgomery[];
 extern const uint16_t psis_inv_montgomery[];
 extern const uint16_t zetas[];

 /*************************************************
 * Name:        ntt
 *
 * Description: Computes negacyclic number-theoretic transform (NTT) of
 *              a polynomial (vector of 256 coefficients) in place;
 *              inputs assumed to be in normal order, output in
 *              bitreversed order
 *
 * Arguments:   - uint16_t *p: pointer to in/output polynomial
 **************************************************/
 void ntt(uint16_t *p) {
  int level, start, j;
  int twiddle = 1; /* index of the next zetas[] entry; starts at 1 */

  for (level = 7; level >= 0; level--) {
    const int half = 1 << level; /* distance between butterfly partners */
    const int lazy = level & 1;  /* odd level: leave p[j] unreduced */

    /* Each pass of the inner loop covers `half` pairs, so consecutive
       groups start 2 * half coefficients apart. */
    for (start = 0; start < KYBER_N; start += 2 * half) {
      const uint16_t zeta = zetas[twiddle++];

      for (j = start; j < start + half; ++j) {
        const uint16_t lo = p[j];
        const uint16_t t = montgomery_reduce((uint32_t)zeta * p[j + half]);

        p[j + half] = barrett_reduce(lo + 4 * KYBER_Q - t);

        if (lazy) {
          p[j] = lo + t; /* Omit reduction (be lazy) */
        } else {
          p[j] = barrett_reduce(lo + t);
        }
      }
    }
  }
 }

 /*************************************************
 * Name:        invntt
 *
 * Description: Computes inverse of negacyclic number-theoretic transform
 *              (NTT) of a polynomial (vector of 256 coefficients) in
 *              place; inputs assumed to be in bitreversed order, output
 *              in normal order
 *
 * Arguments:   - uint16_t *a: pointer to in/output polynomial
 **************************************************/
 void invntt(uint16_t *a) {
  int start, j, jTwiddle, level;
  uint16_t temp, W;
  uint32_t t;

  for (level = 0; level < 8; level++) {
    for (start = 0; start < (1 << level); start++) {
      jTwiddle = 0; /* twiddle table is read from its start for every start */
      for (j = start; j < KYBER_N - 1; j += 2 * (1 << level)) {
        W = omegas_inv_bitrev_montgomery[jTwiddle++];
        temp = a[j];

        if (level & 1) { /* odd level */
          a[j] = barrett_reduce((temp + a[j + (1 << level)]));
        } else {
          a[j] = (temp + a[j + (1 << level)]); /* Omit reduction (be lazy) */
        }

        /* 4 * KYBER_Q is added before subtracting the partner coefficient;
           the arithmetic is done in uint32_t because of the cast on temp. */
        t = (W * ((uint32_t)temp + 4 * KYBER_Q - a[j + (1 << level)]));

        a[j + (1 << level)] = montgomery_reduce(t);
      }
    }
  }

  for (j = 0; j < KYBER_N; j++) {
    /* Widen before multiplying: two uint16_t operands would otherwise be
       promoted to signed int, whose overflow is undefined. ntt() uses the
       same explicit uint32_t cast. */
    a[j] = montgomery_reduce((uint32_t)a[j] * psis_inv_montgomery[j]);
  }
 }
 #include "ntt.h"
 #include "inttypes.h"
 #include "params.h"
 #include "reduce.h"

 extern const uint16_t omegas_inv_bitrev_montgomery[];
 extern const uint16_t psis_inv_montgomery[];
 extern const uint16_t zetas[];

 /*************************************************
 * Name:        ntt
 *
 * Description: Computes negacyclic number-theoretic transform (NTT) of
 *              a polynomial (vector of 256 coefficients) in place;
 *              inputs assumed to be in normal order, output in
 *              bitreversed order
 *
 * Arguments:   - uint16_t *p: pointer to in/output polynomial
 **************************************************/
 void ntt(uint16_t *p) {
    int level, start, j;
    int twiddle = 1; /* index of the next zetas[] entry; starts at 1 */

    for (level = 7; level >= 0; level--) {
        const int half = 1 << level; /* distance between butterfly partners */
        const int lazy = level & 1;  /* odd level: leave p[j] unreduced */

        /* Each pass of the inner loop covers `half` pairs, so consecutive
           groups start 2 * half coefficients apart. */
        for (start = 0; start < KYBER_N; start += 2 * half) {
            const uint16_t zeta = zetas[twiddle++];

            for (j = start; j < start + half; ++j) {
                const uint16_t lo = p[j];
                const uint16_t t =
                    montgomery_reduce((uint32_t)zeta * p[j + half]);

                p[j + half] = barrett_reduce(lo + 4 * KYBER_Q - t);

                if (lazy) {
                    p[j] = lo + t; /* Omit reduction (be lazy) */
                } else {
                    p[j] = barrett_reduce(lo + t);
                }
            }
        }
    }
 }

 /*************************************************
 * Name:        invntt
 *
 * Description: Computes inverse of negacyclic number-theoretic transform
 *              (NTT) of a polynomial (vector of 256 coefficients) in
 *              place; inputs assumed to be in bitreversed order, output
 *              in normal order
 *
 * Arguments:   - uint16_t *a: pointer to in/output polynomial
 **************************************************/
 void invntt(uint16_t *a) {
    int start, j, jTwiddle, level;
    uint16_t temp, W;
    uint32_t t;

    for (level = 0; level < 8; level++) {
        for (start = 0; start < (1 << level); start++) {
            jTwiddle = 0; /* twiddle table is read from its start each time */
            for (j = start; j < KYBER_N - 1; j += 2 * (1 << level)) {
                W = omegas_inv_bitrev_montgomery[jTwiddle++];
                temp = a[j];

                if (level & 1) { /* odd level */
                    a[j] = barrett_reduce((temp + a[j + (1 << level)]));
                } else {
                    a[j] = (temp +
                            a[j + (1 << level)]); /* Omit reduction (be lazy) */
                }

                /* 4 * KYBER_Q is added before subtracting the partner
                   coefficient; the arithmetic is done in uint32_t because
                   of the cast on temp. */
                t = (W * ((uint32_t)temp + 4 * KYBER_Q - a[j + (1 << level)]));

                a[j + (1 << level)] = montgomery_reduce(t);
            }
        }
    }

    for (j = 0; j < KYBER_N; j++) {
        /* Widen before multiplying: two uint16_t operands would otherwise
           be promoted to signed int, whose overflow is undefined. ntt()
           uses the same explicit uint32_t cast. */
        a[j] = montgomery_reduce((uint32_t)a[j] * psis_inv_montgomery[j]);
    }
 }
--- a/crypto_kem/kyber768/clean/ntt.h
+++ b/crypto_kem/kyber768/clean/ntt.h
@@ -1,9 +1,9 @@
 #ifndef NTT_H
 #define NTT_H

 #include <stdint.h>

 void ntt(uint16_t *poly);
 void invntt(uint16_t *a);

 #endif
 #ifndef NTT_H
 #define NTT_H

 #include <stdint.h>

 void ntt(uint16_t *poly);
 void invntt(uint16_t *a);

 #endif
--- a/crypto_kem/kyber768/clean/params.h
+++ b/crypto_kem/kyber768/clean/params.h
@@ -1,31 +1,31 @@
 #ifndef PARAMS_H
 #define PARAMS_H

 #define KYBER_K 3

 #define KYBER_N 256
 #define KYBER_Q 7681

 #define KYBER_ETA 4

 #define KYBER_SYMBYTES 32 /* size in bytes of shared key, hashes, and seeds */

 #define KYBER_POLYBYTES 416
 #define KYBER_POLYCOMPRESSEDBYTES 96
 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)
 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

 #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
 #define KYBER_INDCPA_PUBLICKEYBYTES                                            \
  (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_SYMBYTES)
 #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
 #define KYBER_INDCPA_BYTES                                                     \
  (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)

 #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
 #define KYBER_SECRETKEYBYTES                                                   \
  (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES +                 \
   2 * KYBER_SYMBYTES) /* 2 * 32 extra bytes: H(pk) and the reject value z */
 #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES

 #endif
 #ifndef PARAMS_H
 #define PARAMS_H

 #define KYBER_K 3

 #define KYBER_N 256
 #define KYBER_Q 7681

 #define KYBER_ETA 4

 #define KYBER_SYMBYTES 32 /* size in bytes of shared key, hashes, and seeds */

 #define KYBER_POLYBYTES 416
 #define KYBER_POLYCOMPRESSEDBYTES 96
 #define KYBER_POLYVECBYTES (KYBER_K * KYBER_POLYBYTES)
 #define KYBER_POLYVECCOMPRESSEDBYTES (KYBER_K * 352)

 #define KYBER_INDCPA_MSGBYTES KYBER_SYMBYTES
 #define KYBER_INDCPA_PUBLICKEYBYTES                                            \
    (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_SYMBYTES)
 #define KYBER_INDCPA_SECRETKEYBYTES (KYBER_POLYVECBYTES)
 #define KYBER_INDCPA_BYTES                                                     \
    (KYBER_POLYVECCOMPRESSEDBYTES + KYBER_POLYCOMPRESSEDBYTES)

 #define KYBER_PUBLICKEYBYTES (KYBER_INDCPA_PUBLICKEYBYTES)
 #define KYBER_SECRETKEYBYTES                                                   \
    (KYBER_INDCPA_SECRETKEYBYTES + KYBER_INDCPA_PUBLICKEYBYTES +               \
     2 * KYBER_SYMBYTES) /* 2 * 32 extra bytes: H(pk) and the reject value z */
 #define KYBER_CIPHERTEXTBYTES KYBER_INDCPA_BYTES

 #endif
--- a/crypto_kem/kyber768/clean/poly.c
+++ b/crypto_kem/kyber768/clean/poly.c
@@ -15,19 +15,20 @@
 *              - const poly *a:    pointer to input polynomial
 **************************************************/
 void poly_compress(unsigned char *r, const poly *a) {
  uint32_t t[8];
  unsigned int i, j, k = 0;

  for (i = 0; i < KYBER_N; i += 8) {
    for (j = 0; j < 8; j++) {
      t[j] = (((freeze(a->coeffs[i + j]) << 3) + KYBER_Q / 2) / KYBER_Q) & 7;
    uint32_t t[8];
    unsigned int i, j, k = 0;

    for (i = 0; i < KYBER_N; i += 8) {
        for (j = 0; j < 8; j++) {
            t[j] =
                (((freeze(a->coeffs[i + j]) << 3) + KYBER_Q / 2) / KYBER_Q) & 7;
        }

        r[k] = t[0] | (t[1] << 3) | (t[2] << 6);
        r[k + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
        r[k + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
        k += 3;
    }

    r[k] = t[0] | (t[1] << 3) | (t[2] << 6);
    r[k + 1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
    r[k + 2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
    k += 3;
  }
 }

 /*************************************************
@@ -40,18 +41,20 @@ void poly_compress(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: pointer to input byte array
 **************************************************/
 void poly_decompress(poly *r, const unsigned char *a) {
  unsigned int i;
  for (i = 0; i < KYBER_N; i += 8) {
    r->coeffs[i + 0] = (((a[0] & 7) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 1] = ((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 2] = ((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 3] = ((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 4] = ((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 5] = ((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 6] = ((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3;
    r->coeffs[i + 7] = ((((a[2] >> 5)) * KYBER_Q) + 4) >> 3;
    a += 3;
  }
    unsigned int i;
    for (i = 0; i < KYBER_N; i += 8) {
        r->coeffs[i + 0] = (((a[0] & 7) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 1] = ((((a[0] >> 3) & 7) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 2] =
            ((((a[0] >> 6) | ((a[1] << 2) & 4)) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 3] = ((((a[1] >> 1) & 7) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 4] = ((((a[1] >> 4) & 7) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 5] =
            ((((a[1] >> 7) | ((a[2] << 1) & 6)) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 6] = ((((a[2] >> 2) & 7) * KYBER_Q) + 4) >> 3;
        r->coeffs[i + 7] = ((((a[2] >> 5)) * KYBER_Q) + 4) >> 3;
        a += 3;
    }
 }

 /*************************************************
@@ -63,28 +66,28 @@ void poly_decompress(poly *r, const unsigned char *a) {
 *              - const poly *a:    pointer to input polynomial
 **************************************************/
 void poly_tobytes(unsigned char *r, const poly *a) {
  int i, j;
  uint16_t t[8];

  for (i = 0; i < KYBER_N / 8; i++) {
    for (j = 0; j < 8; j++) {
      t[j] = freeze(a->coeffs[8 * i + j]);
    int i, j;
    uint16_t t[8];

    for (i = 0; i < KYBER_N / 8; i++) {
        for (j = 0; j < 8; j++) {
            t[j] = freeze(a->coeffs[8 * i + j]);
        }

        r[13 * i + 0] = t[0] & 0xff;
        r[13 * i + 1] = (t[0] >> 8) | ((t[1] & 0x07) << 5);
        r[13 * i + 2] = (t[1] >> 3) & 0xff;
        r[13 * i + 3] = (t[1] >> 11) | ((t[2] & 0x3f) << 2);
        r[13 * i + 4] = (t[2] >> 6) | ((t[3] & 0x01) << 7);
        r[13 * i + 5] = (t[3] >> 1) & 0xff;
        r[13 * i + 6] = (t[3] >> 9) | ((t[4] & 0x0f) << 4);
        r[13 * i + 7] = (t[4] >> 4) & 0xff;
        r[13 * i + 8] = (t[4] >> 12) | ((t[5] & 0x7f) << 1);
        r[13 * i + 9] = (t[5] >> 7) | ((t[6] & 0x03) << 6);
        r[13 * i + 10] = (t[6] >> 2) & 0xff;
        r[13 * i + 11] = (t[6] >> 10) | ((t[7] & 0x1f) << 3);
        r[13 * i + 12] = (t[7] >> 5);
    }

    r[13 * i + 0] = t[0] & 0xff;
    r[13 * i + 1] = (t[0] >> 8) | ((t[1] & 0x07) << 5);
    r[13 * i + 2] = (t[1] >> 3) & 0xff;
    r[13 * i + 3] = (t[1] >> 11) | ((t[2] & 0x3f) << 2);
    r[13 * i + 4] = (t[2] >> 6) | ((t[3] & 0x01) << 7);
    r[13 * i + 5] = (t[3] >> 1) & 0xff;
    r[13 * i + 6] = (t[3] >> 9) | ((t[4] & 0x0f) << 4);
    r[13 * i + 7] = (t[4] >> 4) & 0xff;
    r[13 * i + 8] = (t[4] >> 12) | ((t[5] & 0x7f) << 1);
    r[13 * i + 9] = (t[5] >> 7) | ((t[6] & 0x03) << 6);
    r[13 * i + 10] = (t[6] >> 2) & 0xff;
    r[13 * i + 11] = (t[6] >> 10) | ((t[7] & 0x1f) << 3);
    r[13 * i + 12] = (t[7] >> 5);
  }
 }

 /*************************************************
@@ -97,29 +100,29 @@ void poly_tobytes(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: pointer to input byte array
 **************************************************/
 void poly_frombytes(poly *r, const unsigned char *a) {
  int i;
  for (i = 0; i < KYBER_N / 8; i++) {
    r->coeffs[8 * i + 0] =
        a[13 * i + 0] | (((uint16_t)a[13 * i + 1] & 0x1f) << 8);
    r->coeffs[8 * i + 1] = (a[13 * i + 1] >> 5) |
                           (((uint16_t)a[13 * i + 2]) << 3) |
                           (((uint16_t)a[13 * i + 3] & 0x03) << 11);
    r->coeffs[8 * i + 2] =
        (a[13 * i + 3] >> 2) | (((uint16_t)a[13 * i + 4] & 0x7f) << 6);
    r->coeffs[8 * i + 3] = (a[13 * i + 4] >> 7) |
                           (((uint16_t)a[13 * i + 5]) << 1) |
                           (((uint16_t)a[13 * i + 6] & 0x0f) << 9);
    r->coeffs[8 * i + 4] = (a[13 * i + 6] >> 4) |
                           (((uint16_t)a[13 * i + 7]) << 4) |
                           (((uint16_t)a[13 * i + 8] & 0x01) << 12);
    r->coeffs[8 * i + 5] =
        (a[13 * i + 8] >> 1) | (((uint16_t)a[13 * i + 9] & 0x3f) << 7);
    r->coeffs[8 * i + 6] = (a[13 * i + 9] >> 6) |
                           (((uint16_t)a[13 * i + 10]) << 2) |
                           (((uint16_t)a[13 * i + 11] & 0x07) << 10);
    r->coeffs[8 * i + 7] =
        (a[13 * i + 11] >> 3) | (((uint16_t)a[13 * i + 12]) << 5);
  }
    int i;
    for (i = 0; i < KYBER_N / 8; i++) {
        r->coeffs[8 * i + 0] =
            a[13 * i + 0] | (((uint16_t)a[13 * i + 1] & 0x1f) << 8);
        r->coeffs[8 * i + 1] = (a[13 * i + 1] >> 5) |
                               (((uint16_t)a[13 * i + 2]) << 3) |
                               (((uint16_t)a[13 * i + 3] & 0x03) << 11);
        r->coeffs[8 * i + 2] =
            (a[13 * i + 3] >> 2) | (((uint16_t)a[13 * i + 4] & 0x7f) << 6);
        r->coeffs[8 * i + 3] = (a[13 * i + 4] >> 7) |
                               (((uint16_t)a[13 * i + 5]) << 1) |
                               (((uint16_t)a[13 * i + 6] & 0x0f) << 9);
        r->coeffs[8 * i + 4] = (a[13 * i + 6] >> 4) |
                               (((uint16_t)a[13 * i + 7]) << 4) |
                               (((uint16_t)a[13 * i + 8] & 0x01) << 12);
        r->coeffs[8 * i + 5] =
            (a[13 * i + 8] >> 1) | (((uint16_t)a[13 * i + 9] & 0x3f) << 7);
        r->coeffs[8 * i + 6] = (a[13 * i + 9] >> 6) |
                               (((uint16_t)a[13 * i + 10]) << 2) |
                               (((uint16_t)a[13 * i + 11] & 0x07) << 10);
        r->coeffs[8 * i + 7] =
            (a[13 * i + 11] >> 3) | (((uint16_t)a[13 * i + 12]) << 5);
    }
 }

 /*************************************************
@@ -134,18 +137,18 @@ void poly_frombytes(poly *r, const unsigned char *a) {
 *              - unsigned char nonce:       one-byte input nonce
 **************************************************/
 void poly_getnoise(poly *r, const unsigned char *seed, unsigned char nonce) {
  unsigned char buf[KYBER_ETA * KYBER_N / 4];
  unsigned char extseed[KYBER_SYMBYTES + 1];
  int i;
    unsigned char buf[KYBER_ETA * KYBER_N / 4];
    unsigned char extseed[KYBER_SYMBYTES + 1];
    int i;

  for (i = 0; i < KYBER_SYMBYTES; i++) {
    extseed[i] = seed[i];
  }
  extseed[KYBER_SYMBYTES] = nonce;
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        extseed[i] = seed[i];
    }
    extseed[KYBER_SYMBYTES] = nonce;

  shake256(buf, KYBER_ETA * KYBER_N / 4, extseed, KYBER_SYMBYTES + 1);
    shake256(buf, KYBER_ETA * KYBER_N / 4, extseed, KYBER_SYMBYTES + 1);

  cbd(r, buf);
    cbd(r, buf);
 }

 /*************************************************
@@ -159,7 +162,7 @@ void poly_getnoise(poly *r, const unsigned char *seed, unsigned char nonce) {
 * Arguments:   - uint16_t *r: pointer to in/output polynomial
 **************************************************/
 void poly_ntt(poly *r) {
  ntt(r->coeffs);
    ntt(r->coeffs);
 }

 /*************************************************
@@ -172,7 +175,7 @@ void poly_ntt(poly *r) {
 * Arguments:   - uint16_t *a: pointer to in/output polynomial
 **************************************************/
 void poly_invntt(poly *r) {
  invntt(r->coeffs);
    invntt(r->coeffs);
 }

 /*************************************************
@@ -185,10 +188,10 @@ void poly_invntt(poly *r) {
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
 void poly_add(poly *r, const poly *a, const poly *b) {
  int i;
  for (i = 0; i < KYBER_N; i++) {
    r->coeffs[i] = barrett_reduce(a->coeffs[i] + b->coeffs[i]);
  }
    int i;
    for (i = 0; i < KYBER_N; i++) {
        r->coeffs[i] = barrett_reduce(a->coeffs[i] + b->coeffs[i]);
    }
 }

 /*************************************************
@@ -201,10 +204,11 @@ void poly_add(poly *r, const poly *a, const poly *b) {
 *            - const poly *b: pointer to second input polynomial
 **************************************************/
 void poly_sub(poly *r, const poly *a, const poly *b) {
  int i;
  for (i = 0; i < KYBER_N; i++) {
    r->coeffs[i] = barrett_reduce(a->coeffs[i] + 3 * KYBER_Q - b->coeffs[i]);
  }
    int i;
    for (i = 0; i < KYBER_N; i++) {
        r->coeffs[i] =
            barrett_reduce(a->coeffs[i] + 3 * KYBER_Q - b->coeffs[i]);
    }
 }

 /*************************************************
@@ -216,14 +220,14 @@ void poly_sub(poly *r, const poly *a, const poly *b) {
 *              - const unsigned char *msg: pointer to input message
 **************************************************/
 void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]) {
  uint16_t i, j, mask;
    uint16_t i, j, mask;

  for (i = 0; i < KYBER_SYMBYTES; i++) {
    for (j = 0; j < 8; j++) {
      mask = -((msg[i] >> j) & 1);
      r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2);
    for (i = 0; i < KYBER_SYMBYTES; i++) {
        for (j = 0; j < 8; j++) {
            mask = -((msg[i] >> j) & 1);
            r->coeffs[8 * i + j] = mask & ((KYBER_Q + 1) / 2);
        }
    }
  }
 }

 /*************************************************
@@ -235,14 +239,16 @@ void poly_frommsg(poly *r, const unsigned char msg[KYBER_SYMBYTES]) {
 *              - const poly *a:      pointer to input polynomial
 **************************************************/
 void poly_tomsg(unsigned char msg[KYBER_SYMBYTES], const poly *a) {
  uint16_t t;
  int i, j;

  for (i = 0; i < KYBER_SYMBYTES; i++) {
    msg[i] = 0;
    for (j = 0; j < 8; j++) {
      t = (((freeze(a->coeffs[8 * i + j]) << 1) + KYBER_Q / 2) / KYBER_Q) & 1;
      msg[i] |= t << j;
    uint16_t t;
    int i, j;

    for (i = 0; i < KYBER_SYMBYTES; i++) {
        msg[i] = 0;
        for (j = 0; j < 8; j++) {
            t = (((freeze(a->coeffs[8 * i + j]) << 1) + KYBER_Q / 2) /
                 KYBER_Q) &
                1;
            msg[i] |= t << j;
        }
    }
  }
 }
--- a/crypto_kem/kyber768/clean/poly.h
+++ b/crypto_kem/kyber768/clean/poly.h
@@ -9,7 +9,7 @@
 * coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
 */
 typedef struct {
  uint16_t coeffs[KYBER_N];
    uint16_t coeffs[KYBER_N];
 } poly;

 void poly_compress(unsigned char *r, const poly *a);
--- a/crypto_kem/kyber768/clean/polyvec.c
+++ b/crypto_kem/kyber768/clean/polyvec.c
@@ -13,31 +13,31 @@
 *              - const polyvec *a: pointer to input vector of polynomials
 **************************************************/
 void polyvec_compress(unsigned char *r, const polyvec *a) {
  int i, j, k;
  uint16_t t[8];
  for (i = 0; i < KYBER_K; i++) {
    for (j = 0; j < KYBER_N / 8; j++) {
      for (k = 0; k < 8; k++) {
        t[k] = ((((uint32_t)freeze(a->vec[i].coeffs[8 * j + k]) << 11) +
                 KYBER_Q / 2) /
                KYBER_Q) &
               0x7ff;
      }
    int i, j, k;
    uint16_t t[8];
    for (i = 0; i < KYBER_K; i++) {
        for (j = 0; j < KYBER_N / 8; j++) {
            for (k = 0; k < 8; k++) {
                t[k] = ((((uint32_t)freeze(a->vec[i].coeffs[8 * j + k]) << 11) +
                         KYBER_Q / 2) /
                        KYBER_Q) &
                       0x7ff;
            }

      r[11 * j + 0] = t[0] & 0xff;
      r[11 * j + 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3);
      r[11 * j + 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6);
      r[11 * j + 3] = (t[2] >> 2) & 0xff;
      r[11 * j + 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
      r[11 * j + 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4);
      r[11 * j + 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7);
      r[11 * j + 7] = (t[5] >> 1) & 0xff;
      r[11 * j + 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2);
      r[11 * j + 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5);
      r[11 * j + 10] = (t[7] >> 3);
            r[11 * j + 0] = t[0] & 0xff;
            r[11 * j + 1] = (t[0] >> 8) | ((t[1] & 0x1f) << 3);
            r[11 * j + 2] = (t[1] >> 5) | ((t[2] & 0x03) << 6);
            r[11 * j + 3] = (t[2] >> 2) & 0xff;
            r[11 * j + 4] = (t[2] >> 10) | ((t[3] & 0x7f) << 1);
            r[11 * j + 5] = (t[3] >> 7) | ((t[4] & 0x0f) << 4);
            r[11 * j + 6] = (t[4] >> 4) | ((t[5] & 0x01) << 7);
            r[11 * j + 7] = (t[5] >> 1) & 0xff;
            r[11 * j + 8] = (t[5] >> 9) | ((t[6] & 0x3f) << 2);
            r[11 * j + 9] = (t[6] >> 6) | ((t[7] & 0x07) << 5);
            r[11 * j + 10] = (t[7] >> 3);
        }
        r += 352;
    }
    r += 352;
  }
 }

 /*************************************************
@@ -50,54 +50,61 @@ void polyvec_compress(unsigned char *r, const polyvec *a) {
 *              - unsigned char *a: pointer to input byte array
 **************************************************/
 void polyvec_decompress(polyvec *r, const unsigned char *a) {
  int i, j;
  for (i = 0; i < KYBER_K; i++) {
    for (j = 0; j < KYBER_N / 8; j++) {
      r->vec[i].coeffs[8 * j + 0] =
          (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 1] =
          ((((a[11 * j + 1] >> 3) | (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 2] =
          ((((a[11 * j + 2] >> 6) | (((uint32_t)a[11 * j + 3] & 0xff) << 2) |
             (((uint32_t)a[11 * j + 4] & 0x01) << 10)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 3] =
          ((((a[11 * j + 4] >> 1) | (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 4] =
          ((((a[11 * j + 5] >> 4) | (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 5] =
          ((((a[11 * j + 6] >> 7) | (((uint32_t)a[11 * j + 7] & 0xff) << 1) |
             (((uint32_t)a[11 * j + 8] & 0x03) << 9)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 6] =
          ((((a[11 * j + 8] >> 2) | (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) *
            KYBER_Q) +
           1024) >>
          11;
      r->vec[i].coeffs[8 * j + 7] =
          ((((a[11 * j + 9] >> 5) | (((uint32_t)a[11 * j + 10] & 0xff) << 3)) *
            KYBER_Q) +
           1024) >>
          11;
    int i, j;
    for (i = 0; i < KYBER_K; i++) {
        for (j = 0; j < KYBER_N / 8; j++) {
            r->vec[i].coeffs[8 * j + 0] =
                (((a[11 * j + 0] | (((uint32_t)a[11 * j + 1] & 0x07) << 8)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 1] =
                ((((a[11 * j + 1] >> 3) |
                   (((uint32_t)a[11 * j + 2] & 0x3f) << 5)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 2] =
                ((((a[11 * j + 2] >> 6) |
                   (((uint32_t)a[11 * j + 3] & 0xff) << 2) |
                   (((uint32_t)a[11 * j + 4] & 0x01) << 10)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 3] =
                ((((a[11 * j + 4] >> 1) |
                   (((uint32_t)a[11 * j + 5] & 0x0f) << 7)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 4] =
                ((((a[11 * j + 5] >> 4) |
                   (((uint32_t)a[11 * j + 6] & 0x7f) << 4)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 5] =
                ((((a[11 * j + 6] >> 7) |
                   (((uint32_t)a[11 * j + 7] & 0xff) << 1) |
                   (((uint32_t)a[11 * j + 8] & 0x03) << 9)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 6] =
                ((((a[11 * j + 8] >> 2) |
                   (((uint32_t)a[11 * j + 9] & 0x1f) << 6)) *
                  KYBER_Q) +
                 1024) >>
                11;
            r->vec[i].coeffs[8 * j + 7] =
                ((((a[11 * j + 9] >> 5) |
                   (((uint32_t)a[11 * j + 10] & 0xff) << 3)) *
                  KYBER_Q) +
                 1024) >>
                11;
        }
        a += 352;
    }
    a += 352;
  }
 }

 /*************************************************
@@ -109,10 +116,10 @@ void polyvec_decompress(polyvec *r, const unsigned char *a) {
 *              - const polyvec *a: pointer to input vector of polynomials
 **************************************************/
 void polyvec_tobytes(unsigned char *r, const polyvec *a) {
  int i;
  for (i = 0; i < KYBER_K; i++) {
    poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
  }
    int i;
    for (i = 0; i < KYBER_K; i++) {
        poly_tobytes(r + i * KYBER_POLYBYTES, &a->vec[i]);
    }
 }

 /*************************************************
@@ -125,10 +132,10 @@ void polyvec_tobytes(unsigned char *r, const polyvec *a) {
 *              - const polyvec *a: pointer to input vector of polynomials
 **************************************************/
 void polyvec_frombytes(polyvec *r, const unsigned char *a) {
  int i;
  for (i = 0; i < KYBER_K; i++) {
    poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES);
  }
    int i;
    for (i = 0; i < KYBER_K; i++) {
        poly_frombytes(&r->vec[i], a + i * KYBER_POLYBYTES);
    }
 }

 /*************************************************
@@ -139,10 +146,10 @@ void polyvec_frombytes(polyvec *r, const unsigned char *a) {
 * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
 **************************************************/
 void polyvec_ntt(polyvec *r) {
  int i;
  for (i = 0; i < KYBER_K; i++) {
    poly_ntt(&r->vec[i]);
  }
    int i;
    for (i = 0; i < KYBER_K; i++) {
        poly_ntt(&r->vec[i]);
    }
 }

 /*************************************************
@@ -153,10 +160,10 @@ void polyvec_ntt(polyvec *r) {
 * Arguments:   - polyvec *r: pointer to in/output vector of polynomials
 **************************************************/
 void polyvec_invntt(polyvec *r) {
  int i;
  for (i = 0; i < KYBER_K; i++) {
    poly_invntt(&r->vec[i]);
  }
    int i;
    for (i = 0; i < KYBER_K; i++) {
        poly_invntt(&r->vec[i]);
    }
 }

 /*************************************************
@@ -169,18 +176,18 @@ void polyvec_invntt(polyvec *r) {
 *            - const polyvec *b: pointer to second input vector of polynomials
 **************************************************/
 void polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) {
  int i, j;
  uint16_t t;
  for (j = 0; j < KYBER_N; j++) {
    t = montgomery_reduce(4613 *
                          (uint32_t)b->vec[0].coeffs[j]); // 4613 = 2^{2*18} % q
    r->coeffs[j] = montgomery_reduce(a->vec[0].coeffs[j] * t);
    for (i = 1; i < KYBER_K; i++) {
      t = montgomery_reduce(4613 * (uint32_t)b->vec[i].coeffs[j]);
      r->coeffs[j] += montgomery_reduce(a->vec[i].coeffs[j] * t);
    int i, j;
    uint16_t t;
    for (j = 0; j < KYBER_N; j++) {
        t = montgomery_reduce(
            4613 * (uint32_t)b->vec[0].coeffs[j]); // 4613 = 2^{2*18} % q
        r->coeffs[j] = montgomery_reduce(a->vec[0].coeffs[j] * t);
        for (i = 1; i < KYBER_K; i++) {
            t = montgomery_reduce(4613 * (uint32_t)b->vec[i].coeffs[j]);
            r->coeffs[j] += montgomery_reduce(a->vec[i].coeffs[j] * t);
        }
        r->coeffs[j] = barrett_reduce(r->coeffs[j]);
    }
    r->coeffs[j] = barrett_reduce(r->coeffs[j]);
  }
 }

 /*************************************************
@@ -193,8 +200,8 @@ void polyvec_pointwise_acc(poly *r, const polyvec *a, const polyvec *b) {
 *            - const polyvec *b: pointer to second input vector of polynomials
 **************************************************/
 void polyvec_add(polyvec *r, const polyvec *a, const polyvec *b) {
  int i;
  for (i = 0; i < KYBER_K; i++) {
    poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
  }
    int i;
    for (i = 0; i < KYBER_K; i++) {
        poly_add(&r->vec[i], &a->vec[i], &b->vec[i]);
    }
 }
--- a/crypto_kem/kyber768/clean/polyvec.h
+++ b/crypto_kem/kyber768/clean/polyvec.h
@@ -5,7 +5,7 @@
 #include "poly.h"

 typedef struct {
  poly vec[KYBER_K];
    poly vec[KYBER_K];
 } polyvec;

 void polyvec_compress(unsigned char *r, const polyvec *a);
--- a/crypto_kem/kyber768/clean/reduce.c
+++ b/crypto_kem/kyber768/clean/reduce.c
@@ -18,13 +18,13 @@ static const uint32_t rlog = 18;
 *q.
 **************************************************/
 uint16_t montgomery_reduce(uint32_t a) {
  uint32_t u;
    uint32_t u;

  u = (a * qinv);
  u &= ((1 << rlog) - 1);
  u *= KYBER_Q;
  a = a + u;
  return a >> rlog;
    u = (a * qinv);
    u &= ((1 << rlog) - 1);
    u *= KYBER_Q;
    a = a + u;
    return a >> rlog;
 }

 /*************************************************
@@ -38,12 +38,12 @@ uint16_t montgomery_reduce(uint32_t a) {
 * Returns:     unsigned integer in {0,...,11768} congruent to a modulo q.
 **************************************************/
 uint16_t barrett_reduce(uint16_t a) {
  uint32_t u;
    uint32_t u;

  u = a >> 13; //((uint32_t) a * sinv) >> 16;
  u *= KYBER_Q;
  a -= u;
  return a;
    u = a >> 13; //((uint32_t) a * sinv) >> 16;
    u *= KYBER_Q;
    a -= u;
    return a;
 }

 /*************************************************
@@ -57,14 +57,14 @@ uint16_t barrett_reduce(uint16_t a) {
 * Returns:     unsigned integer in {0,...,q-1} congruent to a modulo q.
 **************************************************/
 uint16_t freeze(uint16_t x) {
  uint16_t m, r;
  int16_t c;
  r = barrett_reduce(x);
    uint16_t m, r;
    int16_t c;
    r = barrett_reduce(x);

  m = r - KYBER_Q;
  c = m;
  c >>= 15;
  r = m ^ ((r ^ m) & c);
    m = r - KYBER_Q;
    c = m;
    c >>= 15;
    r = m ^ ((r ^ m) & c);

  return r;
    return r;
 }
--- a/crypto_kem/kyber768/clean/verify.c
+++ b/crypto_kem/kyber768/clean/verify.c
@@ -13,16 +13,16 @@
 * Returns 0 if the byte arrays are equal, 1 otherwise
 **************************************************/
 int verify(const unsigned char *a, const unsigned char *b, size_t len) {
  uint64_t r;
  size_t i;
  r = 0;
    uint64_t r;
    size_t i;
    r = 0;

  for (i = 0; i < len; i++) {
    r |= a[i] ^ b[i];
  }
    for (i = 0; i < len; i++) {
        r |= a[i] ^ b[i];
    }

  r = (-r) >> 63;
  return r;
    r = (-r) >> 63;
    return r;
 }

 /*************************************************
@@ -40,10 +40,10 @@ int verify(const unsigned char *a, const unsigned char *b, size_t len) {
 **************************************************/
 void cmov(unsigned char *r, const unsigned char *x, size_t len,
          unsigned char b) {
  size_t i;
    size_t i;

  b = -b;
  for (i = 0; i < len; i++) {
    r[i] ^= b & (x[i] ^ r[i]);
  }
    b = -b;
    for (i = 0; i < len; i++) {
        r[i] ^= b & (x[i] ^ r[i]);
    }
 }
--- a/crypto_kem/test.c
+++ b/crypto_kem/test.c
@@ -9,122 +9,126 @@
 * make sure it is not touched by the implementations.
 */
 static void write_canary(unsigned char *d) {
  *((uint64_t *)d) = 0x0123456789ABCDEF;
    *((uint64_t *)d) = 0x0123456789ABCDEF;
 }

 static int check_canary(const unsigned char *d) {
  if (*(uint64_t *)d != 0x0123456789ABCDEF) {
    return -1;
  }
  { return 0; }
    if (*(uint64_t *)d != 0x0123456789ABCDEF) {
        return -1;
    }
    { return 0; }
 }

 static int test_keys(void) {
  unsigned char key_a[CRYPTO_BYTES + 16], key_b[CRYPTO_BYTES + 16];
  unsigned char pk[CRYPTO_PUBLICKEYBYTES + 16];
  unsigned char sendb[CRYPTO_CIPHERTEXTBYTES + 16];
  unsigned char sk_a[CRYPTO_SECRETKEYBYTES + 16];

  write_canary(key_a);
  write_canary(key_a + sizeof(key_a) - 8);
  write_canary(key_b);
  write_canary(key_b + sizeof(key_b) - 8);
  write_canary(pk);
  write_canary(pk + sizeof(pk) - 8);
  write_canary(sendb);
  write_canary(sendb + sizeof(sendb) - 8);
  write_canary(sk_a);
  write_canary(sk_a + sizeof(sk_a) - 8);

  int i;

  for (i = 0; i < NTESTS; i++) {
    // Alice generates a public key
    crypto_kem_keypair(pk + 8, sk_a + 8);

    // Bob derives a secret key and creates a response
    crypto_kem_enc(sendb + 8, key_b + 8, pk + 8);

    // Alice uses Bobs response to get her secret key
    crypto_kem_dec(key_a + 8, sendb + 8, sk_a + 8);

    if (memcmp(key_a + 8, key_b + 8, CRYPTO_BYTES) != 0) {
      printf("ERROR KEYS\n");
    } else if (check_canary(key_a) || check_canary(key_a + sizeof(key_a) - 8) ||
               check_canary(key_b) || check_canary(key_b + sizeof(key_b) - 8) ||
               check_canary(pk) || check_canary(pk + sizeof(pk) - 8) ||
               check_canary(sendb) || check_canary(sendb + sizeof(sendb) - 8) ||
               check_canary(sk_a) || check_canary(sk_a + sizeof(sk_a) - 8)) {
      printf("ERROR canary overwritten\n");
    unsigned char key_a[CRYPTO_BYTES + 16], key_b[CRYPTO_BYTES + 16];
    unsigned char pk[CRYPTO_PUBLICKEYBYTES + 16];
    unsigned char sendb[CRYPTO_CIPHERTEXTBYTES + 16];
    unsigned char sk_a[CRYPTO_SECRETKEYBYTES + 16];

    write_canary(key_a);
    write_canary(key_a + sizeof(key_a) - 8);
    write_canary(key_b);
    write_canary(key_b + sizeof(key_b) - 8);
    write_canary(pk);
    write_canary(pk + sizeof(pk) - 8);
    write_canary(sendb);
    write_canary(sendb + sizeof(sendb) - 8);
    write_canary(sk_a);
    write_canary(sk_a + sizeof(sk_a) - 8);

    int i;

    for (i = 0; i < NTESTS; i++) {
        // Alice generates a public key
        crypto_kem_keypair(pk + 8, sk_a + 8);

        // Bob derives a secret key and creates a response
        crypto_kem_enc(sendb + 8, key_b + 8, pk + 8);

        // Alice uses Bobs response to get her secret key
        crypto_kem_dec(key_a + 8, sendb + 8, sk_a + 8);

        if (memcmp(key_a + 8, key_b + 8, CRYPTO_BYTES) != 0) {
            printf("ERROR KEYS\n");
        } else if (check_canary(key_a) ||
                   check_canary(key_a + sizeof(key_a) - 8) ||
                   check_canary(key_b) ||
                   check_canary(key_b + sizeof(key_b) - 8) ||
                   check_canary(pk) || check_canary(pk + sizeof(pk) - 8) ||
                   check_canary(sendb) ||
                   check_canary(sendb + sizeof(sendb) - 8) ||
                   check_canary(sk_a) ||
                   check_canary(sk_a + sizeof(sk_a) - 8)) {
            printf("ERROR canary overwritten\n");
        }
    }
  }

  return 0;
    return 0;
 }

 static int test_invalid_sk_a(void) {
  unsigned char sk_a[CRYPTO_SECRETKEYBYTES];
  unsigned char key_a[CRYPTO_BYTES], key_b[CRYPTO_BYTES];
  unsigned char pk[CRYPTO_PUBLICKEYBYTES];
  unsigned char sendb[CRYPTO_CIPHERTEXTBYTES];
  int i;
    unsigned char sk_a[CRYPTO_SECRETKEYBYTES];
    unsigned char key_a[CRYPTO_BYTES], key_b[CRYPTO_BYTES];
    unsigned char pk[CRYPTO_PUBLICKEYBYTES];
    unsigned char sendb[CRYPTO_CIPHERTEXTBYTES];
    int i;

  for (i = 0; i < NTESTS; i++) {
    // Alice generates a public key
    crypto_kem_keypair(pk, sk_a);
    for (i = 0; i < NTESTS; i++) {
        // Alice generates a public key
        crypto_kem_keypair(pk, sk_a);

    // Bob derives a secret key and creates a response
    crypto_kem_enc(sendb, key_b, pk);
        // Bob derives a secret key and creates a response
        crypto_kem_enc(sendb, key_b, pk);

    // Replace secret key with random values
    randombytes(sk_a, CRYPTO_SECRETKEYBYTES);
        // Replace secret key with random values
        randombytes(sk_a, CRYPTO_SECRETKEYBYTES);

    // Alice uses Bobs response to get her secret key
    crypto_kem_dec(key_a, sendb, sk_a);
        // Alice uses Bobs response to get her secret key
        crypto_kem_dec(key_a, sendb, sk_a);

    if (!memcmp(key_a, key_b, CRYPTO_BYTES)) {
      printf("ERROR invalid sk_a\n");
        if (!memcmp(key_a, key_b, CRYPTO_BYTES)) {
            printf("ERROR invalid sk_a\n");
        }
    }
  }

  return 0;
    return 0;
 }

 static int test_invalid_ciphertext(void) {
  unsigned char sk_a[CRYPTO_SECRETKEYBYTES];
  unsigned char key_a[CRYPTO_BYTES], key_b[CRYPTO_BYTES];
  unsigned char pk[CRYPTO_PUBLICKEYBYTES];
  unsigned char sendb[CRYPTO_CIPHERTEXTBYTES];
  int i;
  size_t pos;
    unsigned char sk_a[CRYPTO_SECRETKEYBYTES];
    unsigned char key_a[CRYPTO_BYTES], key_b[CRYPTO_BYTES];
    unsigned char pk[CRYPTO_PUBLICKEYBYTES];
    unsigned char sendb[CRYPTO_CIPHERTEXTBYTES];
    int i;
    size_t pos;

  for (i = 0; i < NTESTS; i++) {
    randombytes((unsigned char *)&pos, sizeof(size_t));
    for (i = 0; i < NTESTS; i++) {
        randombytes((unsigned char *)&pos, sizeof(size_t));

    // Alice generates a public key
    crypto_kem_keypair(pk, sk_a);
        // Alice generates a public key
        crypto_kem_keypair(pk, sk_a);

    // Bob derives a secret key and creates a response
    crypto_kem_enc(sendb, key_b, pk);
        // Bob derives a secret key and creates a response
        crypto_kem_enc(sendb, key_b, pk);

    // Change some byte in the ciphertext (i.e., encapsulated key)
    sendb[pos % CRYPTO_CIPHERTEXTBYTES] ^= 23;
        // Change some byte in the ciphertext (i.e., encapsulated key)
        sendb[pos % CRYPTO_CIPHERTEXTBYTES] ^= 23;

    // Alice uses Bobs response to get her secret key
    crypto_kem_dec(key_a, sendb, sk_a);
        // Alice uses Bobs response to get her secret key
        crypto_kem_dec(key_a, sendb, sk_a);

    if (!memcmp(key_a, key_b, CRYPTO_BYTES)) {
      printf("ERROR invalid ciphertext\n");
        if (!memcmp(key_a, key_b, CRYPTO_BYTES)) {
            printf("ERROR invalid ciphertext\n");
        }
    }
  }

  return 0;
    return 0;
 }

 int main(void) {
  test_keys();
  test_invalid_sk_a();
  test_invalid_ciphertext();
    test_keys();
    test_invalid_sk_a();
    test_invalid_ciphertext();

  return 0;
    return 0;
 }
--- a/crypto_sign/dilithium-iii/clean/ntt.c
+++ b/crypto_sign/dilithium-iii/clean/ntt.c
@@ -84,20 +84,20 @@ static const uint32_t zetas_inv[N] = {
 * Arguments:   - uint32_t p[N]: input/output coefficient array
 **************************************************/
 void ntt(uint32_t p[N]) {
  unsigned int len, start, j, k;
  uint32_t zeta, t;
    unsigned int len, start, j, k;
    uint32_t zeta, t;

  k = 1;
  for (len = 128; len > 0; len >>= 1) {
    for (start = 0; start < N; start = j + len) {
      zeta = zetas[k++];
      for (j = start; j < start + len; ++j) {
        t = montgomery_reduce((uint64_t)zeta * p[j + len]);
        p[j + len] = p[j] + 2 * Q - t;
        p[j] = p[j] + t;
      }
    k = 1;
    for (len = 128; len > 0; len >>= 1) {
        for (start = 0; start < N; start = j + len) {
            zeta = zetas[k++];
            for (j = start; j < start + len; ++j) {
                t = montgomery_reduce((uint64_t)zeta * p[j + len]);
                p[j + len] = p[j] + 2 * Q - t;
                p[j] = p[j] + t;
            }
        }
    }
  }
 }

 /*************************************************
@@ -111,25 +111,25 @@ void ntt(uint32_t p[N]) {
 * Arguments:   - uint32_t p[N]: input/output coefficient array
 **************************************************/
 void invntt_frominvmont(uint32_t p[N]) {
  unsigned int start, len, j, k;
  uint32_t t, zeta;
  const uint32_t f =
      (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;
    unsigned int start, len, j, k;
    uint32_t t, zeta;
    const uint32_t f =
        (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;

  k = 0;
  for (len = 1; len < N; len <<= 1) {
    for (start = 0; start < N; start = j + len) {
      zeta = zetas_inv[k++];
      for (j = start; j < start + len; ++j) {
        t = p[j];
        p[j] = t + p[j + len];
        p[j + len] = t + 256 * Q - p[j + len];
        p[j + len] = montgomery_reduce((uint64_t)zeta * p[j + len]);
      }
    k = 0;
    for (len = 1; len < N; len <<= 1) {
        for (start = 0; start < N; start = j + len) {
            zeta = zetas_inv[k++];
            for (j = start; j < start + len; ++j) {
                t = p[j];
                p[j] = t + p[j + len];
                p[j + len] = t + 256 * Q - p[j + len];
                p[j + len] = montgomery_reduce((uint64_t)zeta * p[j + len]);
            }
        }
    }
  }

  for (j = 0; j < N; ++j) {
    p[j] = montgomery_reduce((uint64_t)f * p[j]);
  }
    for (j = 0; j < N; ++j) {
        p[j] = montgomery_reduce((uint64_t)f * p[j]);
    }
 }
--- a/crypto_sign/dilithium-iii/clean/packing.c
+++ b/crypto_sign/dilithium-iii/clean/packing.c
@@ -14,14 +14,14 @@
 **************************************************/
 void pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
             const unsigned char rho[SEEDBYTES], const polyveck *t1) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < SEEDBYTES; ++i)
    pk[i] = rho[i];
  pk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        pk[i] = rho[i];
    pk += SEEDBYTES;

  for (i = 0; i < K; ++i)
    polyt1_pack(pk + i * POLT1_SIZE_PACKED, t1->vec + i);
    for (i = 0; i < K; ++i)
        polyt1_pack(pk + i * POLT1_SIZE_PACKED, t1->vec + i);
 }

 /*************************************************
@@ -35,14 +35,14 @@ void pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
 **************************************************/
 void unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1,
               const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < SEEDBYTES; ++i)
    rho[i] = pk[i];
  pk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        rho[i] = pk[i];
    pk += SEEDBYTES;

  for (i = 0; i < K; ++i)
    polyt1_unpack(t1->vec + i, pk + i * POLT1_SIZE_PACKED);
    for (i = 0; i < K; ++i)
        polyt1_unpack(t1->vec + i, pk + i * POLT1_SIZE_PACKED);
 }

 /*************************************************
@@ -63,30 +63,30 @@ void pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
             const unsigned char key[SEEDBYTES],
             const unsigned char tr[CRHBYTES], const polyvecl *s1,
             const polyveck *s2, const polyveck *t0) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < SEEDBYTES; ++i)
    sk[i] = rho[i];
  sk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        sk[i] = rho[i];
    sk += SEEDBYTES;

  for (i = 0; i < SEEDBYTES; ++i)
    sk[i] = key[i];
  sk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        sk[i] = key[i];
    sk += SEEDBYTES;

  for (i = 0; i < CRHBYTES; ++i)
    sk[i] = tr[i];
  sk += CRHBYTES;
    for (i = 0; i < CRHBYTES; ++i)
        sk[i] = tr[i];
    sk += CRHBYTES;

  for (i = 0; i < L; ++i)
    polyeta_pack(sk + i * POLETA_SIZE_PACKED, s1->vec + i);
  sk += L * POLETA_SIZE_PACKED;
    for (i = 0; i < L; ++i)
        polyeta_pack(sk + i * POLETA_SIZE_PACKED, s1->vec + i);
    sk += L * POLETA_SIZE_PACKED;

  for (i = 0; i < K; ++i)
    polyeta_pack(sk + i * POLETA_SIZE_PACKED, s2->vec + i);
  sk += K * POLETA_SIZE_PACKED;
    for (i = 0; i < K; ++i)
        polyeta_pack(sk + i * POLETA_SIZE_PACKED, s2->vec + i);
    sk += K * POLETA_SIZE_PACKED;

  for (i = 0; i < K; ++i)
    polyt0_pack(sk + i * POLT0_SIZE_PACKED, t0->vec + i);
    for (i = 0; i < K; ++i)
        polyt0_pack(sk + i * POLT0_SIZE_PACKED, t0->vec + i);
 }

 /*************************************************
@@ -105,30 +105,30 @@ void pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
 void unpack_sk(unsigned char rho[SEEDBYTES], unsigned char key[SEEDBYTES],
               unsigned char tr[CRHBYTES], polyvecl *s1, polyveck *s2,
               polyveck *t0, const unsigned char sk[CRYPTO_SECRETKEYBYTES]) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < SEEDBYTES; ++i)
    rho[i] = sk[i];
  sk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        rho[i] = sk[i];
    sk += SEEDBYTES;

  for (i = 0; i < SEEDBYTES; ++i)
    key[i] = sk[i];
  sk += SEEDBYTES;
    for (i = 0; i < SEEDBYTES; ++i)
        key[i] = sk[i];
    sk += SEEDBYTES;

  for (i = 0; i < CRHBYTES; ++i)
    tr[i] = sk[i];
  sk += CRHBYTES;
    for (i = 0; i < CRHBYTES; ++i)
        tr[i] = sk[i];
    sk += CRHBYTES;

  for (i = 0; i < L; ++i)
    polyeta_unpack(s1->vec + i, sk + i * POLETA_SIZE_PACKED);
  sk += L * POLETA_SIZE_PACKED;
    for (i = 0; i < L; ++i)
        polyeta_unpack(s1->vec + i, sk + i * POLETA_SIZE_PACKED);
    sk += L * POLETA_SIZE_PACKED;

  for (i = 0; i < K; ++i)
    polyeta_unpack(s2->vec + i, sk + i * POLETA_SIZE_PACKED);
  sk += K * POLETA_SIZE_PACKED;
    for (i = 0; i < K; ++i)
        polyeta_unpack(s2->vec + i, sk + i * POLETA_SIZE_PACKED);
    sk += K * POLETA_SIZE_PACKED;

  for (i = 0; i < K; ++i)
    polyt0_unpack(t0->vec + i, sk + i * POLT0_SIZE_PACKED);
    for (i = 0; i < K; ++i)
        polyt0_unpack(t0->vec + i, sk + i * POLT0_SIZE_PACKED);
 }

 /*************************************************
@@ -143,43 +143,43 @@ void unpack_sk(unsigned char rho[SEEDBYTES], unsigned char key[SEEDBYTES],
 **************************************************/
 void pack_sig(unsigned char sig[CRYPTO_BYTES], const polyvecl *z,
              const polyveck *h, const poly *c) {
  unsigned int i, j, k;
  uint64_t signs, mask;

  for (i = 0; i < L; ++i)
    polyz_pack(sig + i * POLZ_SIZE_PACKED, z->vec + i);
  sig += L * POLZ_SIZE_PACKED;

  /* Encode h */
  k = 0;
  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j)
      if (h->vec[i].coeffs[j] != 0)
        sig[k++] = j;

    sig[OMEGA + i] = k;
  }
  while (k < OMEGA)
    sig[k++] = 0;
  sig += OMEGA + K;

  /* Encode c */
  signs = 0;
  mask = 1;
  for (i = 0; i < N / 8; ++i) {
    sig[i] = 0;
    for (j = 0; j < 8; ++j) {
      if (c->coeffs[8 * i + j] != 0) {
        sig[i] |= (1U << j);
        if (c->coeffs[8 * i + j] == (Q - 1))
          signs |= mask;
        mask <<= 1;
      }
    unsigned int i, j, k;
    uint64_t signs, mask;

    for (i = 0; i < L; ++i)
        polyz_pack(sig + i * POLZ_SIZE_PACKED, z->vec + i);
    sig += L * POLZ_SIZE_PACKED;

    /* Encode h */
    k = 0;
    for (i = 0; i < K; ++i) {
        for (j = 0; j < N; ++j)
            if (h->vec[i].coeffs[j] != 0)
                sig[k++] = j;

        sig[OMEGA + i] = k;
    }
  }
  sig += N / 8;
  for (i = 0; i < 8; ++i)
    sig[i] = signs >> 8 * i;
    while (k < OMEGA)
        sig[k++] = 0;
    sig += OMEGA + K;

    /* Encode c */
    signs = 0;
    mask = 1;
    for (i = 0; i < N / 8; ++i) {
        sig[i] = 0;
        for (j = 0; j < 8; ++j) {
            if (c->coeffs[8 * i + j] != 0) {
                sig[i] |= (1U << j);
                if (c->coeffs[8 * i + j] == (Q - 1))
                    signs |= mask;
                mask <<= 1;
            }
        }
    }
    sig += N / 8;
    for (i = 0; i < 8; ++i)
        sig[i] = signs >> 8 * i;
 }

 /*************************************************
@@ -197,60 +197,60 @@ void pack_sig(unsigned char sig[CRYPTO_BYTES], const polyvecl *z,
 **************************************************/
 int unpack_sig(polyvecl *z, polyveck *h, poly *c,
               const unsigned char sig[CRYPTO_BYTES]) {
  unsigned int i, j, k;
  uint64_t signs, mask;

  for (i = 0; i < L; ++i)
    polyz_unpack(z->vec + i, sig + i * POLZ_SIZE_PACKED);
  sig += L * POLZ_SIZE_PACKED;

  /* Decode h */
  k = 0;
  for (i = 0; i < K; ++i) {
    for (j = 0; j < N; ++j)
      h->vec[i].coeffs[j] = 0;

    if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
      return 1;

    for (j = k; j < sig[OMEGA + i]; ++j) {
      /* Coefficients are ordered for strong unforgeability */
      if (j > k && sig[j] <= sig[j - 1])
        return 1;
      h->vec[i].coeffs[sig[j]] = 1;
    unsigned int i, j, k;
    uint64_t signs, mask;

    for (i = 0; i < L; ++i)
        polyz_unpack(z->vec + i, sig + i * POLZ_SIZE_PACKED);
    sig += L * POLZ_SIZE_PACKED;

    /* Decode h */
    k = 0;
    for (i = 0; i < K; ++i) {
        for (j = 0; j < N; ++j)
            h->vec[i].coeffs[j] = 0;

        if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
            return 1;

        for (j = k; j < sig[OMEGA + i]; ++j) {
            /* Coefficients are ordered for strong unforgeability */
            if (j > k && sig[j] <= sig[j - 1])
                return 1;
            h->vec[i].coeffs[sig[j]] = 1;
        }

        k = sig[OMEGA + i];
    }

    k = sig[OMEGA + i];
  }
    /* Extra indices are zero for strong unforgeability */
    for (j = k; j < OMEGA; ++j)
        if (sig[j])
            return 1;

  /* Extra indices are zero for strong unforgeability */
  for (j = k; j < OMEGA; ++j)
    if (sig[j])
      return 1;
    sig += OMEGA + K;

  sig += OMEGA + K;
    /* Decode c */
    for (i = 0; i < N; ++i)
        c->coeffs[i] = 0;

  /* Decode c */
  for (i = 0; i < N; ++i)
    c->coeffs[i] = 0;
    signs = 0;
    for (i = 0; i < 8; ++i)
        signs |= (uint64_t)sig[N / 8 + i] << 8 * i;

  signs = 0;
  for (i = 0; i < 8; ++i)
    signs |= (uint64_t)sig[N / 8 + i] << 8 * i;

  /* Extra sign bits are zero for strong unforgeability */
  if (signs >> 60)
    return 1;
    /* Extra sign bits are zero for strong unforgeability */
    if (signs >> 60)
        return 1;

  mask = 1;
  for (i = 0; i < N / 8; ++i) {
    for (j = 0; j < 8; ++j) {
      if ((sig[i] >> j) & 0x01) {
        c->coeffs[8 * i + j] = (signs & mask) ? Q - 1 : 1;
        mask <<= 1;
      }
    mask = 1;
    for (i = 0; i < N / 8; ++i) {
        for (j = 0; j < 8; ++j) {
            if ((sig[i] >> j) & 0x01) {
                c->coeffs[8 * i + j] = (signs & mask) ? Q - 1 : 1;
                mask <<= 1;
            }
        }
    }
  }

  return 0;
    return 0;
 }
--- a/crypto_sign/dilithium-iii/clean/params.h
+++ b/crypto_sign/dilithium-iii/clean/params.h
@@ -61,8 +61,8 @@

 #define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K * POLT1_SIZE_PACKED)
 #define CRYPTO_SECRETKEYBYTES                                                  \
  (2 * SEEDBYTES + (L + K) * POLETA_SIZE_PACKED + CRHBYTES +                   \
   K * POLT0_SIZE_PACKED)
    (2 * SEEDBYTES + (L + K) * POLETA_SIZE_PACKED + CRHBYTES +                 \
     K * POLT0_SIZE_PACKED)
 #define CRYPTO_BYTES (L * POLZ_SIZE_PACKED + (OMEGA + K) + (N / 8 + 8))

 #endif
--- a/crypto_sign/dilithium-iii/clean/poly.c
+++ b/crypto_sign/dilithium-iii/clean/poly.c
@@ -15,10 +15,10 @@
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_reduce(poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] = reduce32(a->coeffs[i]);
    for (i = 0; i < N; ++i)
        a->coeffs[i] = reduce32(a->coeffs[i]);
 }

 /*************************************************
@@ -30,10 +30,10 @@ void poly_reduce(poly *a) {
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_csubq(poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] = csubq(a->coeffs[i]);
    for (i = 0; i < N; ++i)
        a->coeffs[i] = csubq(a->coeffs[i]);
 }

 /*************************************************
@@ -45,10 +45,10 @@ void poly_csubq(poly *a) {
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_freeze(poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] = freeze(a->coeffs[i]);
    for (i = 0; i < N; ++i)
        a->coeffs[i] = freeze(a->coeffs[i]);
 }

 /*************************************************
@@ -61,10 +61,10 @@ void poly_freeze(poly *a) {
 *              - const poly *b: pointer to second summand
 **************************************************/
 void poly_add(poly *c, const poly *a, const poly *b) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
    for (i = 0; i < N; ++i)
        c->coeffs[i] = a->coeffs[i] + b->coeffs[i];
 }

 /*************************************************
@@ -80,10 +80,10 @@ void poly_add(poly *c, const poly *a, const poly *b) {
 *                               subtracted from first input polynomial
 **************************************************/
 void poly_sub(poly *c, const poly *a, const poly *b) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i];
    for (i = 0; i < N; ++i)
        c->coeffs[i] = a->coeffs[i] + 2 * Q - b->coeffs[i];
 }

 /*************************************************
@@ -95,10 +95,10 @@ void poly_sub(poly *c, const poly *a, const poly *b) {
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_neg(poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] = Q - a->coeffs[i];
    for (i = 0; i < N; ++i)
        a->coeffs[i] = Q - a->coeffs[i];
 }

 /*************************************************
@@ -111,10 +111,10 @@ void poly_neg(poly *a) {
 *              - unsigned int k: exponent
 **************************************************/
 void poly_shiftl(poly *a, unsigned int k) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] <<= k;
    for (i = 0; i < N; ++i)
        a->coeffs[i] <<= k;
 }

 /*************************************************
@@ -126,7 +126,7 @@ void poly_shiftl(poly *a, unsigned int k) {
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_ntt(poly *a) {
  ntt(a->coeffs);
    ntt(a->coeffs);
 }

 /*************************************************
@@ -138,7 +138,7 @@ void poly_ntt(poly *a) {
 * Arguments:   - poly *a: pointer to input/output polynomial
 **************************************************/
 void poly_invntt_montgomery(poly *a) {
  invntt_frominvmont(a->coeffs);
    invntt_frominvmont(a->coeffs);
 }

 /*************************************************
@@ -154,10 +154,10 @@ void poly_invntt_montgomery(poly *a) {
 *              - const poly *b: pointer to second input polynomial
 **************************************************/
 void poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    c->coeffs[i] = montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]);
    for (i = 0; i < N; ++i)
        c->coeffs[i] = montgomery_reduce((uint64_t)a->coeffs[i] * b->coeffs[i]);
 }

 /*************************************************
@@ -174,10 +174,10 @@ void poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
 *              - const poly *a: pointer to input polynomial
 **************************************************/
 void poly_power2round(poly *a1, poly *a0, const poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a1->coeffs[i] = power2round(a->coeffs[i], a0->coeffs + i);
    for (i = 0; i < N; ++i)
        a1->coeffs[i] = power2round(a->coeffs[i], a0->coeffs + i);
 }

 /*************************************************
@@ -195,10 +195,10 @@ void poly_power2round(poly *a1, poly *a0, const poly *a) {
 *              - const poly *a: pointer to input polynomial
 **************************************************/
 void poly_decompose(poly *a1, poly *a0, const poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a1->coeffs[i] = decompose(a->coeffs[i], a0->coeffs + i);
    for (i = 0; i < N; ++i)
        a1->coeffs[i] = decompose(a->coeffs[i], a0->coeffs + i);
 }

 /*************************************************
@@ -216,13 +216,13 @@ void poly_decompose(poly *a1, poly *a0, const poly *a) {
 * Returns number of 1 bits.
 **************************************************/
 unsigned int poly_make_hint(poly *h, const poly *a, const poly *b) {
  unsigned int i, s = 0;
    unsigned int i, s = 0;

  for (i = 0; i < N; ++i) {
    h->coeffs[i] = make_hint(a->coeffs[i], b->coeffs[i]);
    s += h->coeffs[i];
  }
  return s;
    for (i = 0; i < N; ++i) {
        h->coeffs[i] = make_hint(a->coeffs[i], b->coeffs[i]);
        s += h->coeffs[i];
    }
    return s;
 }

 /*************************************************
@@ -235,10 +235,10 @@ unsigned int poly_make_hint(poly *h, const poly *a, const poly *b) {
 *              - const poly *h: pointer to input hint polynomial
 **************************************************/
 void poly_use_hint(poly *a, const poly *b, const poly *h) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N; ++i)
    a->coeffs[i] = use_hint(b->coeffs[i], h->coeffs[i]);
    for (i = 0; i < N; ++i)
        a->coeffs[i] = use_hint(b->coeffs[i], h->coeffs[i]);
 }

 /*************************************************
@@ -253,23 +253,23 @@ void poly_use_hint(poly *a, const poly *b, const poly *h) {
 * Returns 0 if norm is strictly smaller than B and 1 otherwise.
 **************************************************/
 int poly_chknorm(const poly *a, uint32_t B) {
  unsigned int i;
  int32_t t;

  /* It is ok to leak which coefficient violates the bound since
     the probability for each coefficient is independent of secret
     data but we must not leak the sign of the centralized representative. */
  for (i = 0; i < N; ++i) {
    /* Absolute value of centralized representative */
    t = (Q - 1) / 2 - a->coeffs[i];
    t ^= (t >> 31);
    t = (Q - 1) / 2 - t;

    if ((uint32_t)t >= B) {
      return 1;
    unsigned int i;
    int32_t t;

    /* It is ok to leak which coefficient violates the bound since
       the probability for each coefficient is independent of secret
       data but we must not leak the sign of the centralized representative. */
    for (i = 0; i < N; ++i) {
        /* Absolute value of centralized representative */
        t = (Q - 1) / 2 - a->coeffs[i];
        t ^= (t >> 31);
        t = (Q - 1) / 2 - t;

        if ((uint32_t)t >= B) {
            return 1;
        }
    }
  }
  return 0;
    return 0;
 }

 /*************************************************
@@ -283,19 +283,19 @@ int poly_chknorm(const poly *a, uint32_t B) {
 *              - const unsigned char *buf: array of random bytes
 **************************************************/
 void poly_uniform(poly *a, const unsigned char *buf) {
  unsigned int ctr, pos;
  uint32_t t;

  ctr = pos = 0;
  while (ctr < N) {
    t = buf[pos++];
    t |= (uint32_t)buf[pos++] << 8;
    t |= (uint32_t)buf[pos++] << 16;
    t &= 0x7FFFFF;

    if (t < Q)
      a->coeffs[ctr++] = t;
  }
    unsigned int ctr, pos;
    uint32_t t;

    ctr = pos = 0;
    while (ctr < N) {
        t = buf[pos++];
        t |= (uint32_t)buf[pos++] << 8;
        t |= (uint32_t)buf[pos++] << 16;
        t &= 0x7FFFFF;

        if (t < Q)
            a->coeffs[ctr++] = t;
    }
 }

 /*************************************************
@@ -317,25 +317,25 @@ static unsigned int rej_eta(uint32_t *a, unsigned int len,
 #if ETA > 7
 #error "rej_eta() assumes ETA <= 7"
 #endif
  unsigned int ctr, pos;
  unsigned char t0, t1;
    unsigned int ctr, pos;
    unsigned char t0, t1;

  ctr = pos = 0;
  while (ctr < len && pos < buflen) {
    ctr = pos = 0;
    while (ctr < len && pos < buflen) {
 #if ETA <= 3
    t0 = buf[pos] & 0x07;
    t1 = buf[pos++] >> 5;
        t0 = buf[pos] & 0x07;
        t1 = buf[pos++] >> 5;
 #else
    t0 = buf[pos] & 0x0F;
    t1 = buf[pos++] >> 4;
        t0 = buf[pos] & 0x0F;
        t1 = buf[pos++] >> 4;
 #endif

    if (t0 <= 2 * ETA)
      a[ctr++] = Q + ETA - t0;
    if (t1 <= 2 * ETA && ctr < len)
      a[ctr++] = Q + ETA - t1;
  }
  return ctr;
        if (t0 <= 2 * ETA)
            a[ctr++] = Q + ETA - t0;
        if (t1 <= 2 * ETA && ctr < len)
            a[ctr++] = Q + ETA - t1;
    }
    return ctr;
 }

 /*************************************************
@@ -352,25 +352,25 @@ static unsigned int rej_eta(uint32_t *a, unsigned int len,
 **************************************************/
 void poly_uniform_eta(poly *a, const unsigned char seed[SEEDBYTES],
                      unsigned char nonce) {
  unsigned int i, ctr;
  unsigned char inbuf[SEEDBYTES + 1];
  /* Probability that we need more than 2 blocks: < 2^{-84}
     Probability that we need more than 3 blocks: < 2^{-352} */
  unsigned char outbuf[2 * SHAKE256_RATE];
  uint64_t state[25];

  for (i = 0; i < SEEDBYTES; ++i)
    inbuf[i] = seed[i];
  inbuf[SEEDBYTES] = nonce;

  shake256_absorb(state, inbuf, SEEDBYTES + 1);
  shake256_squeezeblocks(outbuf, 2, state);

  ctr = rej_eta(a->coeffs, N, outbuf, 2 * SHAKE256_RATE);
  if (ctr < N) {
    shake256_squeezeblocks(outbuf, 1, state);
    rej_eta(a->coeffs + ctr, N - ctr, outbuf, SHAKE256_RATE);
  }
    unsigned int i, ctr;
    unsigned char inbuf[SEEDBYTES + 1];
    /* Probability that we need more than 2 blocks: < 2^{-84}
       Probability that we need more than 3 blocks: < 2^{-352} */
    unsigned char outbuf[2 * SHAKE256_RATE];
    uint64_t state[25];

    for (i = 0; i < SEEDBYTES; ++i)
        inbuf[i] = seed[i];
    inbuf[SEEDBYTES] = nonce;

    shake256_absorb(state, inbuf, SEEDBYTES + 1);
    shake256_squeezeblocks(outbuf, 2, state);

    ctr = rej_eta(a->coeffs, N, outbuf, 2 * SHAKE256_RATE);
    if (ctr < N) {
        shake256_squeezeblocks(outbuf, 1, state);
        rej_eta(a->coeffs + ctr, N - ctr, outbuf, SHAKE256_RATE);
    }
 }

 /*************************************************
@@ -394,29 +394,29 @@ static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len,
 #if GAMMA1 > (1 << 19)
 #error "rej_gamma1m1() assumes GAMMA1 - 1 fits in 19 bits"
 #endif
  unsigned int ctr, pos;
  uint32_t t0, t1;

  ctr = pos = 0;
  while (ctr < len && pos + 5 <= buflen) {
    t0 = buf[pos];
    t0 |= (uint32_t)buf[pos + 1] << 8;
    t0 |= (uint32_t)buf[pos + 2] << 16;
    t0 &= 0xFFFFF;

    t1 = buf[pos + 2] >> 4;
    t1 |= (uint32_t)buf[pos + 3] << 4;
    t1 |= (uint32_t)buf[pos + 4] << 12;

    pos += 5;

    if (t0 <= 2 * GAMMA1 - 2)
      a[ctr++] = Q + GAMMA1 - 1 - t0;
    if (t1 <= 2 * GAMMA1 - 2 && ctr < len)
      a[ctr++] = Q + GAMMA1 - 1 - t1;
  }
    unsigned int ctr, pos;
    uint32_t t0, t1;

    ctr = pos = 0;
    while (ctr < len && pos + 5 <= buflen) {
        t0 = buf[pos];
        t0 |= (uint32_t)buf[pos + 1] << 8;
        t0 |= (uint32_t)buf[pos + 2] << 16;
        t0 &= 0xFFFFF;

        t1 = buf[pos + 2] >> 4;
        t1 |= (uint32_t)buf[pos + 3] << 4;
        t1 |= (uint32_t)buf[pos + 4] << 12;

        pos += 5;

        if (t0 <= 2 * GAMMA1 - 2)
            a[ctr++] = Q + GAMMA1 - 1 - t0;
        if (t1 <= 2 * GAMMA1 - 2 && ctr < len)
            a[ctr++] = Q + GAMMA1 - 1 - t1;
    }

  return ctr;
    return ctr;
 }

 /*************************************************
@@ -434,28 +434,28 @@ static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len,
 void poly_uniform_gamma1m1(poly *a,
                           const unsigned char seed[SEEDBYTES + CRHBYTES],
                           uint16_t nonce) {
  unsigned int i, ctr;
  unsigned char inbuf[SEEDBYTES + CRHBYTES + 2];
  /* Probability that we need more than 5 blocks: < 2^{-81}
     Probability that we need more than 6 blocks: < 2^{-467} */
  unsigned char outbuf[5 * SHAKE256_RATE];
  uint64_t state[25];

  for (i = 0; i < SEEDBYTES + CRHBYTES; ++i)
    inbuf[i] = seed[i];
  inbuf[SEEDBYTES + CRHBYTES] = nonce & 0xFF;
  inbuf[SEEDBYTES + CRHBYTES + 1] = nonce >> 8;

  shake256_absorb(state, inbuf, SEEDBYTES + CRHBYTES + 2);
  shake256_squeezeblocks(outbuf, 5, state);

  ctr = rej_gamma1m1(a->coeffs, N, outbuf, 5 * SHAKE256_RATE);
  if (ctr < N) {
    /* There are no bytes left in outbuf
       since 5*SHAKE256_RATE is divisible by 5 */
    shake256_squeezeblocks(outbuf, 1, state);
    rej_gamma1m1(a->coeffs + ctr, N - ctr, outbuf, SHAKE256_RATE);
  }
    unsigned int i, ctr;
    unsigned char inbuf[SEEDBYTES + CRHBYTES + 2];
    /* Probability that we need more than 5 blocks: < 2^{-81}
       Probability that we need more than 6 blocks: < 2^{-467} */
    unsigned char outbuf[5 * SHAKE256_RATE];
    uint64_t state[25];

    for (i = 0; i < SEEDBYTES + CRHBYTES; ++i)
        inbuf[i] = seed[i];
    inbuf[SEEDBYTES + CRHBYTES] = nonce & 0xFF;
    inbuf[SEEDBYTES + CRHBYTES + 1] = nonce >> 8;

    shake256_absorb(state, inbuf, SEEDBYTES + CRHBYTES + 2);
    shake256_squeezeblocks(outbuf, 5, state);

    ctr = rej_gamma1m1(a->coeffs, N, outbuf, 5 * SHAKE256_RATE);
    if (ctr < N) {
        /* There are no bytes left in outbuf
           since 5*SHAKE256_RATE is divisible by 5 */
        shake256_squeezeblocks(outbuf, 1, state);
        rej_gamma1m1(a->coeffs + ctr, N - ctr, outbuf, SHAKE256_RATE);
    }
 }

 /*************************************************
@@ -472,37 +472,37 @@ void polyeta_pack(unsigned char *r, const poly *a) {
 #if ETA > 7
 #error "polyeta_pack() assumes ETA <= 7"
 #endif
  unsigned int i;
  unsigned char t[8];
    unsigned int i;
    unsigned char t[8];

 #if ETA <= 3
  for (i = 0; i < N / 8; ++i) {
    t[0] = Q + ETA - a->coeffs[8 * i + 0];
    t[1] = Q + ETA - a->coeffs[8 * i + 1];
    t[2] = Q + ETA - a->coeffs[8 * i + 2];
    t[3] = Q + ETA - a->coeffs[8 * i + 3];
    t[4] = Q + ETA - a->coeffs[8 * i + 4];
    t[5] = Q + ETA - a->coeffs[8 * i + 5];
    t[6] = Q + ETA - a->coeffs[8 * i + 6];
    t[7] = Q + ETA - a->coeffs[8 * i + 7];

    r[3 * i + 0] = t[0];
    r[3 * i + 0] |= t[1] << 3;
    r[3 * i + 0] |= t[2] << 6;
    r[3 * i + 1] = t[2] >> 2;
    r[3 * i + 1] |= t[3] << 1;
    r[3 * i + 1] |= t[4] << 4;
    r[3 * i + 1] |= t[5] << 7;
    r[3 * i + 2] = t[5] >> 1;
    r[3 * i + 2] |= t[6] << 2;
    r[3 * i + 2] |= t[7] << 5;
  }
    for (i = 0; i < N / 8; ++i) {
        t[0] = Q + ETA - a->coeffs[8 * i + 0];
        t[1] = Q + ETA - a->coeffs[8 * i + 1];
        t[2] = Q + ETA - a->coeffs[8 * i + 2];
        t[3] = Q + ETA - a->coeffs[8 * i + 3];
        t[4] = Q + ETA - a->coeffs[8 * i + 4];
        t[5] = Q + ETA - a->coeffs[8 * i + 5];
        t[6] = Q + ETA - a->coeffs[8 * i + 6];
        t[7] = Q + ETA - a->coeffs[8 * i + 7];

        r[3 * i + 0] = t[0];
        r[3 * i + 0] |= t[1] << 3;
        r[3 * i + 0] |= t[2] << 6;
        r[3 * i + 1] = t[2] >> 2;
        r[3 * i + 1] |= t[3] << 1;
        r[3 * i + 1] |= t[4] << 4;
        r[3 * i + 1] |= t[5] << 7;
        r[3 * i + 2] = t[5] >> 1;
        r[3 * i + 2] |= t[6] << 2;
        r[3 * i + 2] |= t[7] << 5;
    }
 #else
  for (i = 0; i < N / 2; ++i) {
    t[0] = Q + ETA - a->coeffs[2 * i + 0];
    t[1] = Q + ETA - a->coeffs[2 * i + 1];
    r[i] = t[0] | (t[1] << 4);
  }
    for (i = 0; i < N / 2; ++i) {
        t[0] = Q + ETA - a->coeffs[2 * i + 0];
        t[1] = Q + ETA - a->coeffs[2 * i + 1];
        r[i] = t[0] | (t[1] << 4);
    }
 #endif
 }

@@ -516,35 +516,37 @@ void polyeta_pack(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: byte array with bit-packed polynomial
 **************************************************/
 void polyeta_unpack(poly *r, const unsigned char *a) {
  unsigned int i;
    unsigned int i;

 #if ETA <= 3
  for (i = 0; i < N / 8; ++i) {
    r->coeffs[8 * i + 0] = a[3 * i + 0] & 0x07;
    r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 0x07;
    r->coeffs[8 * i + 2] = (a[3 * i + 0] >> 6) | ((a[3 * i + 1] & 0x01) << 2);
    r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 0x07;
    r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 0x07;
    r->coeffs[8 * i + 5] = (a[3 * i + 1] >> 7) | ((a[3 * i + 2] & 0x03) << 1);
    r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 0x07;
    r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5);

    r->coeffs[8 * i + 0] = Q + ETA - r->coeffs[8 * i + 0];
    r->coeffs[8 * i + 1] = Q + ETA - r->coeffs[8 * i + 1];
    r->coeffs[8 * i + 2] = Q + ETA - r->coeffs[8 * i + 2];
    r->coeffs[8 * i + 3] = Q + ETA - r->coeffs[8 * i + 3];
    r->coeffs[8 * i + 4] = Q + ETA - r->coeffs[8 * i + 4];
    r->coeffs[8 * i + 5] = Q + ETA - r->coeffs[8 * i + 5];
    r->coeffs[8 * i + 6] = Q + ETA - r->coeffs[8 * i + 6];
    r->coeffs[8 * i + 7] = Q + ETA - r->coeffs[8 * i + 7];
  }
    for (i = 0; i < N / 8; ++i) {
        r->coeffs[8 * i + 0] = a[3 * i + 0] & 0x07;
        r->coeffs[8 * i + 1] = (a[3 * i + 0] >> 3) & 0x07;
        r->coeffs[8 * i + 2] =
            (a[3 * i + 0] >> 6) | ((a[3 * i + 1] & 0x01) << 2);
        r->coeffs[8 * i + 3] = (a[3 * i + 1] >> 1) & 0x07;
        r->coeffs[8 * i + 4] = (a[3 * i + 1] >> 4) & 0x07;
        r->coeffs[8 * i + 5] =
            (a[3 * i + 1] >> 7) | ((a[3 * i + 2] & 0x03) << 1);
        r->coeffs[8 * i + 6] = (a[3 * i + 2] >> 2) & 0x07;
        r->coeffs[8 * i + 7] = (a[3 * i + 2] >> 5);

        r->coeffs[8 * i + 0] = Q + ETA - r->coeffs[8 * i + 0];
        r->coeffs[8 * i + 1] = Q + ETA - r->coeffs[8 * i + 1];
        r->coeffs[8 * i + 2] = Q + ETA - r->coeffs[8 * i + 2];
        r->coeffs[8 * i + 3] = Q + ETA - r->coeffs[8 * i + 3];
        r->coeffs[8 * i + 4] = Q + ETA - r->coeffs[8 * i + 4];
        r->coeffs[8 * i + 5] = Q + ETA - r->coeffs[8 * i + 5];
        r->coeffs[8 * i + 6] = Q + ETA - r->coeffs[8 * i + 6];
        r->coeffs[8 * i + 7] = Q + ETA - r->coeffs[8 * i + 7];
    }
 #else
  for (i = 0; i < N / 2; ++i) {
    r->coeffs[2 * i + 0] = a[i] & 0x0F;
    r->coeffs[2 * i + 1] = a[i] >> 4;
    r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0];
    r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1];
  }
    for (i = 0; i < N / 2; ++i) {
        r->coeffs[2 * i + 0] = a[i] & 0x0F;
        r->coeffs[2 * i + 1] = a[i] >> 4;
        r->coeffs[2 * i + 0] = Q + ETA - r->coeffs[2 * i + 0];
        r->coeffs[2 * i + 1] = Q + ETA - r->coeffs[2 * i + 1];
    }
 #endif
 }

@@ -562,26 +564,26 @@ void polyt1_pack(unsigned char *r, const poly *a) {
 #if D != 14
 #error "polyt1_pack() assumes D == 14"
 #endif
  unsigned int i;

  for (i = 0; i < N / 8; ++i) {
    r[9 * i + 0] = a->coeffs[8 * i + 0] & 0xFF;
    r[9 * i + 1] =
        (a->coeffs[8 * i + 0] >> 8) | ((a->coeffs[8 * i + 1] & 0x7F) << 1);
    r[9 * i + 2] =
        (a->coeffs[8 * i + 1] >> 7) | ((a->coeffs[8 * i + 2] & 0x3F) << 2);
    r[9 * i + 3] =
        (a->coeffs[8 * i + 2] >> 6) | ((a->coeffs[8 * i + 3] & 0x1F) << 3);
    r[9 * i + 4] =
        (a->coeffs[8 * i + 3] >> 5) | ((a->coeffs[8 * i + 4] & 0x0F) << 4);
    r[9 * i + 5] =
        (a->coeffs[8 * i + 4] >> 4) | ((a->coeffs[8 * i + 5] & 0x07) << 5);
    r[9 * i + 6] =
        (a->coeffs[8 * i + 5] >> 3) | ((a->coeffs[8 * i + 6] & 0x03) << 6);
    r[9 * i + 7] =
        (a->coeffs[8 * i + 6] >> 2) | ((a->coeffs[8 * i + 7] & 0x01) << 7);
    r[9 * i + 8] = a->coeffs[8 * i + 7] >> 1;
  }
    unsigned int i;

    for (i = 0; i < N / 8; ++i) {
        r[9 * i + 0] = a->coeffs[8 * i + 0] & 0xFF;
        r[9 * i + 1] =
            (a->coeffs[8 * i + 0] >> 8) | ((a->coeffs[8 * i + 1] & 0x7F) << 1);
        r[9 * i + 2] =
            (a->coeffs[8 * i + 1] >> 7) | ((a->coeffs[8 * i + 2] & 0x3F) << 2);
        r[9 * i + 3] =
            (a->coeffs[8 * i + 2] >> 6) | ((a->coeffs[8 * i + 3] & 0x1F) << 3);
        r[9 * i + 4] =
            (a->coeffs[8 * i + 3] >> 5) | ((a->coeffs[8 * i + 4] & 0x0F) << 4);
        r[9 * i + 5] =
            (a->coeffs[8 * i + 4] >> 4) | ((a->coeffs[8 * i + 5] & 0x07) << 5);
        r[9 * i + 6] =
            (a->coeffs[8 * i + 5] >> 3) | ((a->coeffs[8 * i + 6] & 0x03) << 6);
        r[9 * i + 7] =
            (a->coeffs[8 * i + 6] >> 2) | ((a->coeffs[8 * i + 7] & 0x01) << 7);
        r[9 * i + 8] = a->coeffs[8 * i + 7] >> 1;
    }
 }

 /*************************************************
@@ -594,26 +596,26 @@ void polyt1_pack(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: byte array with bit-packed polynomial
 **************************************************/
 void polyt1_unpack(poly *r, const unsigned char *a) {
  unsigned int i;

  for (i = 0; i < N / 8; ++i) {
    r->coeffs[8 * i + 0] =
        a[9 * i + 0] | ((uint32_t)(a[9 * i + 1] & 0x01) << 8);
    r->coeffs[8 * i + 1] =
        (a[9 * i + 1] >> 1) | ((uint32_t)(a[9 * i + 2] & 0x03) << 7);
    r->coeffs[8 * i + 2] =
        (a[9 * i + 2] >> 2) | ((uint32_t)(a[9 * i + 3] & 0x07) << 6);
    r->coeffs[8 * i + 3] =
        (a[9 * i + 3] >> 3) | ((uint32_t)(a[9 * i + 4] & 0x0F) << 5);
    r->coeffs[8 * i + 4] =
        (a[9 * i + 4] >> 4) | ((uint32_t)(a[9 * i + 5] & 0x1F) << 4);
    r->coeffs[8 * i + 5] =
        (a[9 * i + 5] >> 5) | ((uint32_t)(a[9 * i + 6] & 0x3F) << 3);
    r->coeffs[8 * i + 6] =
        (a[9 * i + 6] >> 6) | ((uint32_t)(a[9 * i + 7] & 0x7F) << 2);
    r->coeffs[8 * i + 7] =
        (a[9 * i + 7] >> 7) | ((uint32_t)(a[9 * i + 8] & 0xFF) << 1);
  }
    unsigned int i;

    for (i = 0; i < N / 8; ++i) {
        r->coeffs[8 * i + 0] =
            a[9 * i + 0] | ((uint32_t)(a[9 * i + 1] & 0x01) << 8);
        r->coeffs[8 * i + 1] =
            (a[9 * i + 1] >> 1) | ((uint32_t)(a[9 * i + 2] & 0x03) << 7);
        r->coeffs[8 * i + 2] =
            (a[9 * i + 2] >> 2) | ((uint32_t)(a[9 * i + 3] & 0x07) << 6);
        r->coeffs[8 * i + 3] =
            (a[9 * i + 3] >> 3) | ((uint32_t)(a[9 * i + 4] & 0x0F) << 5);
        r->coeffs[8 * i + 4] =
            (a[9 * i + 4] >> 4) | ((uint32_t)(a[9 * i + 5] & 0x1F) << 4);
        r->coeffs[8 * i + 5] =
            (a[9 * i + 5] >> 5) | ((uint32_t)(a[9 * i + 6] & 0x3F) << 3);
        r->coeffs[8 * i + 6] =
            (a[9 * i + 6] >> 6) | ((uint32_t)(a[9 * i + 7] & 0x7F) << 2);
        r->coeffs[8 * i + 7] =
            (a[9 * i + 7] >> 7) | ((uint32_t)(a[9 * i + 8] & 0xFF) << 1);
    }
 }

 /*************************************************
@@ -627,26 +629,26 @@ void polyt1_unpack(poly *r, const unsigned char *a) {
 *              - const poly *a: pointer to input polynomial
 **************************************************/
 void polyt0_pack(unsigned char *r, const poly *a) {
  unsigned int i;
  uint32_t t[4];

  for (i = 0; i < N / 4; ++i) {
    t[0] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 0];
    t[1] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 1];
    t[2] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 2];
    t[3] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 3];

    r[7 * i + 0] = t[0];
    r[7 * i + 1] = t[0] >> 8;
    r[7 * i + 1] |= t[1] << 6;
    r[7 * i + 2] = t[1] >> 2;
    r[7 * i + 3] = t[1] >> 10;
    r[7 * i + 3] |= t[2] << 4;
    r[7 * i + 4] = t[2] >> 4;
    r[7 * i + 5] = t[2] >> 12;
    r[7 * i + 5] |= t[3] << 2;
    r[7 * i + 6] = t[3] >> 6;
  }
    unsigned int i;
    uint32_t t[4];

    for (i = 0; i < N / 4; ++i) {
        t[0] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 0];
        t[1] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 1];
        t[2] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 2];
        t[3] = Q + (1 << (D - 1)) - a->coeffs[4 * i + 3];

        r[7 * i + 0] = t[0];
        r[7 * i + 1] = t[0] >> 8;
        r[7 * i + 1] |= t[1] << 6;
        r[7 * i + 2] = t[1] >> 2;
        r[7 * i + 3] = t[1] >> 10;
        r[7 * i + 3] |= t[2] << 4;
        r[7 * i + 4] = t[2] >> 4;
        r[7 * i + 5] = t[2] >> 12;
        r[7 * i + 5] |= t[3] << 2;
        r[7 * i + 6] = t[3] >> 6;
    }
 }

 /*************************************************
@@ -659,28 +661,28 @@ void polyt0_pack(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: byte array with bit-packed polynomial
 **************************************************/
 void polyt0_unpack(poly *r, const unsigned char *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N / 4; ++i) {
    r->coeffs[4 * i + 0] = a[7 * i + 0];
    r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8;
    for (i = 0; i < N / 4; ++i) {
        r->coeffs[4 * i + 0] = a[7 * i + 0];
        r->coeffs[4 * i + 0] |= (uint32_t)(a[7 * i + 1] & 0x3F) << 8;

    r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
    r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2;
    r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10;
        r->coeffs[4 * i + 1] = a[7 * i + 1] >> 6;
        r->coeffs[4 * i + 1] |= (uint32_t)a[7 * i + 2] << 2;
        r->coeffs[4 * i + 1] |= (uint32_t)(a[7 * i + 3] & 0x0F) << 10;

    r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
    r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4;
    r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12;
        r->coeffs[4 * i + 2] = a[7 * i + 3] >> 4;
        r->coeffs[4 * i + 2] |= (uint32_t)a[7 * i + 4] << 4;
        r->coeffs[4 * i + 2] |= (uint32_t)(a[7 * i + 5] & 0x03) << 12;

    r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
    r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6;
        r->coeffs[4 * i + 3] = a[7 * i + 5] >> 2;
        r->coeffs[4 * i + 3] |= (uint32_t)a[7 * i + 6] << 6;

    r->coeffs[4 * i + 0] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 0];
    r->coeffs[4 * i + 1] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 1];
    r->coeffs[4 * i + 2] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 2];
    r->coeffs[4 * i + 3] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 3];
  }
        r->coeffs[4 * i + 0] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 0];
        r->coeffs[4 * i + 1] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 1];
        r->coeffs[4 * i + 2] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 2];
        r->coeffs[4 * i + 3] = Q + (1 << (D - 1)) - r->coeffs[4 * i + 3];
    }
 }

 /*************************************************
@@ -698,23 +700,23 @@ void polyz_pack(unsigned char *r, const poly *a) {
 #if GAMMA1 > (1 << 19)
 #error "polyz_pack() assumes GAMMA1 <= 2^{19}"
 #endif
  unsigned int i;
  uint32_t t[2];

  for (i = 0; i < N / 2; ++i) {
    /* Map to {0,...,2*GAMMA1 - 2} */
    t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0];
    t[0] += ((int32_t)t[0] >> 31) & Q;
    t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1];
    t[1] += ((int32_t)t[1] >> 31) & Q;

    r[5 * i + 0] = t[0];
    r[5 * i + 1] = t[0] >> 8;
    r[5 * i + 2] = t[0] >> 16;
    r[5 * i + 2] |= t[1] << 4;
    r[5 * i + 3] = t[1] >> 4;
    r[5 * i + 4] = t[1] >> 12;
  }
    unsigned int i;
    uint32_t t[2];

    for (i = 0; i < N / 2; ++i) {
        /* Map to {0,...,2*GAMMA1 - 2} */
        t[0] = GAMMA1 - 1 - a->coeffs[2 * i + 0];
        t[0] += ((int32_t)t[0] >> 31) & Q;
        t[1] = GAMMA1 - 1 - a->coeffs[2 * i + 1];
        t[1] += ((int32_t)t[1] >> 31) & Q;

        r[5 * i + 0] = t[0];
        r[5 * i + 1] = t[0] >> 8;
        r[5 * i + 2] = t[0] >> 16;
        r[5 * i + 2] |= t[1] << 4;
        r[5 * i + 3] = t[1] >> 4;
        r[5 * i + 4] = t[1] >> 12;
    }
 }

 /*************************************************
@@ -728,22 +730,22 @@ void polyz_pack(unsigned char *r, const poly *a) {
 *              - const unsigned char *a: byte array with bit-packed polynomial
 **************************************************/
 void polyz_unpack(poly *r, const unsigned char *a) {
  unsigned int i;

  for (i = 0; i < N / 2; ++i) {
    r->coeffs[2 * i + 0] = a[5 * i + 0];
    r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
    r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16;

    r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
    r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
    r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;

    r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
    r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q;
    r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
    r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q;
  }
    unsigned int i;

    for (i = 0; i < N / 2; ++i) {
        r->coeffs[2 * i + 0] = a[5 * i + 0];
        r->coeffs[2 * i + 0] |= (uint32_t)a[5 * i + 1] << 8;
        r->coeffs[2 * i + 0] |= (uint32_t)(a[5 * i + 2] & 0x0F) << 16;

        r->coeffs[2 * i + 1] = a[5 * i + 2] >> 4;
        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 3] << 4;
        r->coeffs[2 * i + 1] |= (uint32_t)a[5 * i + 4] << 12;

        r->coeffs[2 * i + 0] = GAMMA1 - 1 - r->coeffs[2 * i + 0];
        r->coeffs[2 * i + 0] += ((int32_t)r->coeffs[2 * i + 0] >> 31) & Q;
        r->coeffs[2 * i + 1] = GAMMA1 - 1 - r->coeffs[2 * i + 1];
        r->coeffs[2 * i + 1] += ((int32_t)r->coeffs[2 * i + 1] >> 31) & Q;
    }
 }

 /*************************************************
@@ -757,8 +759,8 @@ void polyz_unpack(poly *r, const unsigned char *a) {
 *              - const poly *a: pointer to input polynomial
 **************************************************/
 void polyw1_pack(unsigned char *r, const poly *a) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < N / 2; ++i)
    r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4);
    for (i = 0; i < N / 2; ++i)
        r[i] = a->coeffs[2 * i + 0] | (a->coeffs[2 * i + 1] << 4);
 }
--- a/crypto_sign/dilithium-iii/clean/poly.h
+++ b/crypto_sign/dilithium-iii/clean/poly.h
@@ -6,7 +6,7 @@
 #include <stdint.h>

 typedef struct {
  uint32_t coeffs[N];
    uint32_t coeffs[N];
 } poly __attribute__((aligned(32)));

 void poly_reduce(poly *a);
--- a/crypto_sign/dilithium-iii/clean/polyvec.c
+++ b/crypto_sign/dilithium-iii/clean/polyvec.c
@@ -16,10 +16,10 @@
 * Arguments:   - polyvecl *v: pointer to input/output vector
 **************************************************/
 void polyvecl_freeze(polyvecl *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < L; ++i)
    poly_freeze(v->vec + i);
    for (i = 0; i < L; ++i)
        poly_freeze(v->vec + i);
 }

 /*************************************************
@@ -33,10 +33,10 @@ void polyvecl_freeze(polyvecl *v) {
 *              - const polyvecl *v: pointer to second summand
 **************************************************/
 void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < L; ++i)
    poly_add(w->vec + i, u->vec + i, v->vec + i);
    for (i = 0; i < L; ++i)
        poly_add(w->vec + i, u->vec + i, v->vec + i);
 }

 /*************************************************
@@ -48,10 +48,10 @@ void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
 * Arguments:   - polyvecl *v: pointer to input/output vector
 **************************************************/
 void polyvecl_ntt(polyvecl *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < L; ++i)
    poly_ntt(v->vec + i);
    for (i = 0; i < L; ++i)
        poly_ntt(v->vec + i);
 }

 /*************************************************
@@ -69,15 +69,15 @@ void polyvecl_ntt(polyvecl *v) {
 **************************************************/
 void polyvecl_pointwise_acc_invmontgomery(poly *w, const polyvecl *u,
                                          const polyvecl *v) {
  unsigned int i;
  poly t;
    unsigned int i;
    poly t;

  poly_pointwise_invmontgomery(w, u->vec + 0, v->vec + 0);
    poly_pointwise_invmontgomery(w, u->vec + 0, v->vec + 0);

  for (i = 1; i < L; ++i) {
    poly_pointwise_invmontgomery(&t, u->vec + i, v->vec + i);
    poly_add(w, w, &t);
  }
    for (i = 1; i < L; ++i) {
        poly_pointwise_invmontgomery(&t, u->vec + i, v->vec + i);
        poly_add(w, w, &t);
    }
 }

 /*************************************************
@@ -93,13 +93,13 @@ void polyvecl_pointwise_acc_invmontgomery(poly *w, const polyvecl *u,
 * otherwise.
 **************************************************/
 int polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
  unsigned int i;
  int ret = 0;
    unsigned int i;
    int ret = 0;

  for (i = 0; i < L; ++i)
    ret |= poly_chknorm(v->vec + i, bound);
    for (i = 0; i < L; ++i)
        ret |= poly_chknorm(v->vec + i, bound);

  return ret;
    return ret;
 }

 /**************************************************************/
@@ -115,10 +115,10 @@ int polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
 void polyveck_reduce(polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_reduce(v->vec + i);
    for (i = 0; i < K; ++i)
        poly_reduce(v->vec + i);
 }

 /*************************************************
@@ -130,10 +130,10 @@ void polyveck_reduce(polyveck *v) {
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
 void polyveck_csubq(polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_csubq(v->vec + i);
    for (i = 0; i < K; ++i)
        poly_csubq(v->vec + i);
 }

 /*************************************************
@@ -145,10 +145,10 @@ void polyveck_csubq(polyveck *v) {
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
 void polyveck_freeze(polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_freeze(v->vec + i);
    for (i = 0; i < K; ++i)
        poly_freeze(v->vec + i);
 }

 /*************************************************
@@ -162,10 +162,10 @@ void polyveck_freeze(polyveck *v) {
 *              - const polyveck *v: pointer to second summand
 **************************************************/
 void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_add(w->vec + i, u->vec + i, v->vec + i);
    for (i = 0; i < K; ++i)
        poly_add(w->vec + i, u->vec + i, v->vec + i);
 }

 /*************************************************
@@ -181,10 +181,10 @@ void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
 *                                   subtracted from first input vector
 **************************************************/
 void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_sub(w->vec + i, u->vec + i, v->vec + i);
    for (i = 0; i < K; ++i)
        poly_sub(w->vec + i, u->vec + i, v->vec + i);
 }

 /*************************************************
@@ -197,10 +197,10 @@ void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
 *              - unsigned int k: exponent
 **************************************************/
 void polyveck_shiftl(polyveck *v, unsigned int k) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_shiftl(v->vec + i, k);
    for (i = 0; i < K; ++i)
        poly_shiftl(v->vec + i, k);
 }

 /*************************************************
@@ -212,10 +212,10 @@ void polyveck_shiftl(polyveck *v, unsigned int k) {
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
 void polyveck_ntt(polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_ntt(v->vec + i);
    for (i = 0; i < K; ++i)
        poly_ntt(v->vec + i);
 }

 /*************************************************
@@ -228,10 +228,10 @@ void polyveck_ntt(polyveck *v) {
 * Arguments:   - polyveck *v: pointer to input/output vector
 **************************************************/
 void polyveck_invntt_montgomery(polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_invntt_montgomery(v->vec + i);
    for (i = 0; i < K; ++i)
        poly_invntt_montgomery(v->vec + i);
 }

 /*************************************************
@@ -247,13 +247,13 @@ void polyveck_invntt_montgomery(polyveck *v) {
 * otherwise.
 **************************************************/
 int polyveck_chknorm(const polyveck *v, uint32_t bound) {
  unsigned int i;
  int ret = 0;
    unsigned int i;
    int ret = 0;

  for (i = 0; i < K; ++i)
    ret |= poly_chknorm(v->vec + i, bound);
    for (i = 0; i < K; ++i)
        ret |= poly_chknorm(v->vec + i, bound);

  return ret;
    return ret;
 }

 /*************************************************
@@ -271,10 +271,10 @@ int polyveck_chknorm(const polyveck *v, uint32_t bound) {
 *              - const polyveck *v: pointer to input vector
 **************************************************/
 void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_power2round(v1->vec + i, v0->vec + i, v->vec + i);
    for (i = 0; i < K; ++i)
        poly_power2round(v1->vec + i, v0->vec + i, v->vec + i);
 }

 /*************************************************
@@ -293,10 +293,10 @@ void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
 *              - const polyveck *v: pointer to input vector
 **************************************************/
 void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_decompose(v1->vec + i, v0->vec + i, v->vec + i);
    for (i = 0; i < K; ++i)
        poly_decompose(v1->vec + i, v0->vec + i, v->vec + i);
 }

 /*************************************************
@@ -312,12 +312,12 @@ void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
 **************************************************/
 unsigned int polyveck_make_hint(polyveck *h, const polyveck *u,
                                const polyveck *v) {
  unsigned int i, s = 0;
    unsigned int i, s = 0;

  for (i = 0; i < K; ++i)
    s += poly_make_hint(h->vec + i, u->vec + i, v->vec + i);
    for (i = 0; i < K; ++i)
        s += poly_make_hint(h->vec + i, u->vec + i, v->vec + i);

  return s;
    return s;
 }

 /*************************************************
@@ -331,8 +331,8 @@ unsigned int polyveck_make_hint(polyveck *h, const polyveck *u,
 *              - const polyveck *h: pointer to input hint vector
 **************************************************/
 void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
  unsigned int i;
    unsigned int i;

  for (i = 0; i < K; ++i)
    poly_use_hint(w->vec + i, u->vec + i, h->vec + i);
    for (i = 0; i < K; ++i)
        poly_use_hint(w->vec + i, u->vec + i, h->vec + i);
 }
--- a/crypto_sign/dilithium-iii/clean/polyvec.h
+++ b/crypto_sign/dilithium-iii/clean/polyvec.h
@@ -7,7 +7,7 @@

 /* Vectors of polynomials of length L */
 typedef struct {
  poly vec[L];
    poly vec[L];
 } polyvecl;

 void polyvecl_freeze(polyvecl *v);
@@ -22,7 +22,7 @@ int polyvecl_chknorm(const polyvecl *v, uint32_t B);

 /* Vectors of polynomials of length K */
 typedef struct {
  poly vec[K];
    poly vec[K];
 } polyveck;

 void polyveck_reduce(polyveck *v);
--- a/crypto_sign/dilithium-iii/clean/reduce.c
+++ b/crypto_sign/dilithium-iii/clean/reduce.c
@@ -13,14 +13,14 @@
 * Returns r.
 **************************************************/
 uint32_t montgomery_reduce(uint64_t a) {
  uint64_t t;
    uint64_t t;

  t = a * QINV;
  t &= (1ULL << 32) - 1;
  t *= Q;
  t = a + t;
  t >>= 32;
  return t;
    t = a * QINV;
    t &= (1ULL << 32) - 1;
    t *= Q;
    t = a + t;
    t >>= 32;
    return t;
 }

 /*************************************************
@@ -34,12 +34,12 @@ uint32_t montgomery_reduce(uint64_t a) {
 * Returns r.
 **************************************************/
 uint32_t reduce32(uint32_t a) {
  uint32_t t;
    uint32_t t;

  t = a & 0x7FFFFF;
  a >>= 23;
  t += (a << 13) - a;
  return t;
    t = a & 0x7FFFFF;
    a >>= 23;
    t += (a << 13) - a;
    return t;
 }

 /*************************************************
@@ -52,9 +52,9 @@ uint32_t reduce32(uint32_t a) {
 * Returns r.
 **************************************************/
 uint32_t csubq(uint32_t a) {
  a -= Q;
  a += ((int32_t)a >> 31) & Q;
  return a;
    a -= Q;
    a += ((int32_t)a >> 31) & Q;
    return a;
 }

 /*************************************************
@@ -68,7 +68,7 @@ uint32_t csubq(uint32_t a) {
 * Returns r.
 **************************************************/
 uint32_t freeze(uint32_t a) {
  a = reduce32(a);
  a = csubq(a);
  return a;
    a = reduce32(a);
    a = csubq(a);
    return a;
 }
--- a/crypto_sign/dilithium-iii/clean/rounding.c
+++ b/crypto_sign/dilithium-iii/clean/rounding.c
@@ -14,16 +14,16 @@
 * Returns a1.
 **************************************************/
 uint32_t power2round(uint32_t a, uint32_t *a0) {
  int32_t t;
    int32_t t;

  /* Centralized remainder mod 2^D */
  t = a & ((1 << D) - 1);
  t -= (1 << (D - 1)) + 1;
  t += (t >> 31) & (1 << D);
  t -= (1 << (D - 1)) - 1;
  *a0 = Q + t;
  a = (a - t) >> D;
  return a;
    /* Centralized remainder mod 2^D */
    t = a & ((1 << D) - 1);
    t -= (1 << (D - 1)) + 1;
    t += (t >> 31) & (1 << D);
    t -= (1 << (D - 1)) - 1;
    *a0 = Q + t;
    a = (a - t) >> D;
    return a;
 }

 /*************************************************
@@ -43,26 +43,26 @@ uint32_t decompose(uint32_t a, uint32_t *a0) {
 #if ALPHA != (Q - 1) / 16
 #error "decompose assumes ALPHA == (Q-1)/16"
 #endif
  int32_t t, u;
    int32_t t, u;

  /* Centralized remainder mod ALPHA */
  t = a & 0x7FFFF;
  t += (a >> 19) << 9;
  t -= ALPHA / 2 + 1;
  t += (t >> 31) & ALPHA;
  t -= ALPHA / 2 - 1;
  a -= t;
    /* Centralized remainder mod ALPHA */
    t = a & 0x7FFFF;
    t += (a >> 19) << 9;
    t -= ALPHA / 2 + 1;
    t += (t >> 31) & ALPHA;
    t -= ALPHA / 2 - 1;
    a -= t;

  /* Divide by ALPHA (possible to avoid) */
  u = a - 1;
  u >>= 31;
  a = (a >> 19) + 1;
  a -= u & 1;
    /* Divide by ALPHA (possible to avoid) */
    u = a - 1;
    u >>= 31;
    a = (a >> 19) + 1;
    a -= u & 1;

  /* Border case */
  *a0 = Q + t - (a >> 4);
  a &= 0xF;
  return a;
    /* Border case */
    *a0 = Q + t - (a >> 4);
    a &= 0xF;
    return a;
 }

 /*************************************************
@@ -78,9 +78,9 @@ uint32_t decompose(uint32_t a, uint32_t *a0) {
 * Returns 1 if high bits of a and b differ and 0 otherwise.
 **************************************************/
 unsigned int make_hint(const uint32_t a, const uint32_t b) {
  uint32_t t;
    uint32_t t;

  return decompose(a, &t) != decompose(b, &t);
    return decompose(a, &t) != decompose(b, &t);
 }

 /*************************************************
@@ -94,22 +94,22 @@ unsigned int make_hint(const uint32_t a, const uint32_t b) {
 * Returns corrected high bits.
 **************************************************/
 uint32_t use_hint(const uint32_t a, const unsigned int hint) {
  uint32_t a0, a1;
    uint32_t a0, a1;

  a1 = decompose(a, &a0);
  if (hint == 0)
    return a1;
  else if (a0 > Q)
    return (a1 + 1) & 0xF;
  else
    return (a1 - 1) & 0xF;
    a1 = decompose(a, &a0);
    if (hint == 0)
        return a1;
    else if (a0 > Q)
        return (a1 + 1) & 0xF;
    else
        return (a1 - 1) & 0xF;

  /* If decompose does not divide out ALPHA:
  if(hint == 0)
    return a1;
  else if(a0 > Q)
    return (a1 + ALPHA) % (Q - 1);
  else
    return (a1 - ALPHA) % (Q - 1);
  */
    /* If decompose does not divide out ALPHA:
    if(hint == 0)
      return a1;
    else if(a0 > Q)
      return (a1 + ALPHA) % (Q - 1);
    else
      return (a1 - ALPHA) % (Q - 1);
    */
 }
--- a/crypto_sign/dilithium-iii/clean/sign.c
+++ b/crypto_sign/dilithium-iii/clean/sign.c
@@ -18,24 +18,24 @@
 *              - const unsigned char rho[]: byte array containing seed rho
 **************************************************/
 void expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) {
  unsigned int i, j;
  unsigned char inbuf[SEEDBYTES + 1];
  /* Don't change this to smaller values,
   * sampling later assumes sufficient SHAKE output!
   * Probability that we need more than 5 blocks: < 2^{-132}.
   * Probability that we need more than 6 blocks: < 2^{-546}. */
  unsigned char outbuf[5 * SHAKE128_RATE];

  for (i = 0; i < SEEDBYTES; ++i)
    inbuf[i] = rho[i];

  for (i = 0; i < K; ++i) {
    for (j = 0; j < L; ++j) {
      inbuf[SEEDBYTES] = i + (j << 4);
      shake128(outbuf, sizeof(outbuf), inbuf, SEEDBYTES + 1);
      poly_uniform(mat[i].vec + j, outbuf);
    unsigned int i, j;
    unsigned char inbuf[SEEDBYTES + 1];
    /* Don't change this to smaller values,
     * sampling later assumes sufficient SHAKE output!
     * Probability that we need more than 5 blocks: < 2^{-132}.
     * Probability that we need more than 6 blocks: < 2^{-546}. */
    unsigned char outbuf[5 * SHAKE128_RATE];

    for (i = 0; i < SEEDBYTES; ++i)
        inbuf[i] = rho[i];

    for (i = 0; i < K; ++i) {
        for (j = 0; j < L; ++j) {
            inbuf[SEEDBYTES] = i + (j << 4);
            shake128(outbuf, sizeof(outbuf), inbuf, SEEDBYTES + 1);
            poly_uniform(mat[i].vec + j, outbuf);
        }
    }
  }
 }

 /*************************************************
@@ -50,43 +50,43 @@ void expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) {
 *              - const polyveck *w1: pointer to vector w1
 **************************************************/
 void challenge(poly *c, const unsigned char mu[CRHBYTES], const polyveck *w1) {
  unsigned int i, b, pos;
  unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
  unsigned char outbuf[SHAKE256_RATE];
  uint64_t state[25], signs, mask;

  for (i = 0; i < CRHBYTES; ++i)
    inbuf[i] = mu[i];
  for (i = 0; i < K; ++i)
    polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, w1->vec + i);

  shake256_absorb(state, inbuf, sizeof(inbuf));
  shake256_squeezeblocks(outbuf, 1, state);

  signs = 0;
  for (i = 0; i < 8; ++i)
    signs |= (uint64_t)outbuf[i] << 8 * i;

  pos = 8;
  mask = 1;

  for (i = 0; i < N; ++i)
    c->coeffs[i] = 0;

  for (i = 196; i < 256; ++i) {
    do {
      if (pos >= SHAKE256_RATE) {
        shake256_squeezeblocks(outbuf, 1, state);
        pos = 0;
      }

      b = outbuf[pos++];
    } while (b > i);

    c->coeffs[i] = c->coeffs[b];
    c->coeffs[b] = (signs & mask) ? Q - 1 : 1;
    mask <<= 1;
  }
    unsigned int i, b, pos;
    unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
    unsigned char outbuf[SHAKE256_RATE];
    uint64_t state[25], signs, mask;

    for (i = 0; i < CRHBYTES; ++i)
        inbuf[i] = mu[i];
    for (i = 0; i < K; ++i)
        polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, w1->vec + i);

    shake256_absorb(state, inbuf, sizeof(inbuf));
    shake256_squeezeblocks(outbuf, 1, state);

    signs = 0;
    for (i = 0; i < 8; ++i)
        signs |= (uint64_t)outbuf[i] << 8 * i;

    pos = 8;
    mask = 1;

    for (i = 0; i < N; ++i)
        c->coeffs[i] = 0;

    for (i = 196; i < 256; ++i) {
        do {
            if (pos >= SHAKE256_RATE) {
                shake256_squeezeblocks(outbuf, 1, state);
                pos = 0;
            }

            b = outbuf[pos++];
        } while (b > i);

        c->coeffs[i] = c->coeffs[b];
        c->coeffs[b] = (signs & mask) ? Q - 1 : 1;
        mask <<= 1;
    }
 }

 /*************************************************
@@ -102,53 +102,53 @@ void challenge(poly *c, const unsigned char mu[CRHBYTES], const polyveck *w1) {
 * Returns 0 (success)
 **************************************************/
 int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
  unsigned int i;
  unsigned char seedbuf[3 * SEEDBYTES];
  unsigned char tr[CRHBYTES];
  unsigned char *rho, *rhoprime, *key;
  uint16_t nonce = 0;
  polyvecl mat[K];
  polyvecl s1, s1hat;
  polyveck s2, t, t1, t0;

  /* Expand 32 bytes of randomness into rho, rhoprime and key */
  randombytes(seedbuf, SEEDBYTES);
  shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
  rho = seedbuf;
  rhoprime = rho + SEEDBYTES;
  key = rho + 2 * SEEDBYTES;

  /* Expand matrix */
  expand_mat(mat, rho);

  /* Sample short vectors s1 and s2 */
  for (i = 0; i < L; ++i)
    poly_uniform_eta(s1.vec + i, rhoprime, nonce++);
  for (i = 0; i < K; ++i)
    poly_uniform_eta(s2.vec + i, rhoprime, nonce++);

  /* Matrix-vector multiplication */
  s1hat = s1;
  polyvecl_ntt(&s1hat);
  for (i = 0; i < K; ++i) {
    polyvecl_pointwise_acc_invmontgomery(t.vec + i, mat + i, &s1hat);
    poly_reduce(t.vec + i);
    poly_invntt_montgomery(t.vec + i);
  }

  /* Add noise vector s2 */
  polyveck_add(&t, &t, &s2);

  /* Extract t1 and write public key */
  polyveck_freeze(&t);
  polyveck_power2round(&t1, &t0, &t);
  pack_pk(pk, rho, &t1);

  /* Compute CRH(rho, t1) and write secret key */
  shake256(tr, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
  pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

  return 0;
    unsigned int i;
    unsigned char seedbuf[3 * SEEDBYTES];
    unsigned char tr[CRHBYTES];
    unsigned char *rho, *rhoprime, *key;
    uint16_t nonce = 0;
    polyvecl mat[K];
    polyvecl s1, s1hat;
    polyveck s2, t, t1, t0;

    /* Expand 32 bytes of randomness into rho, rhoprime and key */
    randombytes(seedbuf, SEEDBYTES);
    shake256(seedbuf, 3 * SEEDBYTES, seedbuf, SEEDBYTES);
    rho = seedbuf;
    rhoprime = rho + SEEDBYTES;
    key = rho + 2 * SEEDBYTES;

    /* Expand matrix */
    expand_mat(mat, rho);

    /* Sample short vectors s1 and s2 */
    for (i = 0; i < L; ++i)
        poly_uniform_eta(s1.vec + i, rhoprime, nonce++);
    for (i = 0; i < K; ++i)
        poly_uniform_eta(s2.vec + i, rhoprime, nonce++);

    /* Matrix-vector multiplication */
    s1hat = s1;
    polyvecl_ntt(&s1hat);
    for (i = 0; i < K; ++i) {
        polyvecl_pointwise_acc_invmontgomery(t.vec + i, mat + i, &s1hat);
        poly_reduce(t.vec + i);
        poly_invntt_montgomery(t.vec + i);
    }

    /* Add noise vector s2 */
    polyveck_add(&t, &t, &s2);

    /* Extract t1 and write public key */
    polyveck_freeze(&t);
    polyveck_power2round(&t1, &t0, &t);
    pack_pk(pk, rho, &t1);

    /* Compute CRH(rho, t1) and write secret key */
    shake256(tr, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
    pack_sk(sk, rho, key, tr, &s1, &s2, &t0);

    return 0;
 }

 /*************************************************
@@ -170,108 +170,108 @@ int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
 int crypto_sign(unsigned char *sm, unsigned long long *smlen,
                const unsigned char *m, unsigned long long mlen,
                const unsigned char *sk) {
  unsigned long long i, j;
  unsigned int n;
  unsigned char
      seedbuf[2 * SEEDBYTES + CRHBYTES]; // TODO: nonce in seedbuf (2x)
  unsigned char tr[CRHBYTES];
  unsigned char *rho, *key, *mu;
  uint16_t nonce = 0;
  poly c, chat;
  polyvecl mat[K], s1, y, yhat, z;
  polyveck s2, t0, w, w1;
  polyveck h, wcs2, wcs20, ct0, tmp;

  rho = seedbuf;
  key = seedbuf + SEEDBYTES;
  mu = seedbuf + 2 * SEEDBYTES;
  unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);

  /* Copy tr and message into the sm buffer,
   * backwards since m and sm can be equal in SUPERCOP API */
  for (i = 1; i <= mlen; ++i)
    sm[CRYPTO_BYTES + mlen - i] = m[mlen - i];
  for (i = 0; i < CRHBYTES; ++i)
    sm[CRYPTO_BYTES - CRHBYTES + i] = tr[i];

  /* Compute CRH(tr, msg) */
  shake256(mu, CRHBYTES, sm + CRYPTO_BYTES - CRHBYTES, CRHBYTES + mlen);

  /* Expand matrix and transform vectors */
  expand_mat(mat, rho);
  polyvecl_ntt(&s1);
  polyveck_ntt(&s2);
  polyveck_ntt(&t0);
    unsigned long long i, j;
    unsigned int n;
    unsigned char
        seedbuf[2 * SEEDBYTES + CRHBYTES]; // TODO: nonce in seedbuf (2x)
    unsigned char tr[CRHBYTES];
    unsigned char *rho, *key, *mu;
    uint16_t nonce = 0;
    poly c, chat;
    polyvecl mat[K], s1, y, yhat, z;
    polyveck s2, t0, w, w1;
    polyveck h, wcs2, wcs20, ct0, tmp;

    rho = seedbuf;
    key = seedbuf + SEEDBYTES;
    mu = seedbuf + 2 * SEEDBYTES;
    unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);

    /* Copy tr and message into the sm buffer,
     * backwards since m and sm can be equal in SUPERCOP API */
    for (i = 1; i <= mlen; ++i)
        sm[CRYPTO_BYTES + mlen - i] = m[mlen - i];
    for (i = 0; i < CRHBYTES; ++i)
        sm[CRYPTO_BYTES - CRHBYTES + i] = tr[i];

    /* Compute CRH(tr, msg) */
    shake256(mu, CRHBYTES, sm + CRYPTO_BYTES - CRHBYTES, CRHBYTES + mlen);

    /* Expand matrix and transform vectors */
    expand_mat(mat, rho);
    polyvecl_ntt(&s1);
    polyveck_ntt(&s2);
    polyveck_ntt(&t0);

 rej:
  /* Sample intermediate vector y */
  for (i = 0; i < L; ++i)
    poly_uniform_gamma1m1(y.vec + i, key, nonce++);

  /* Matrix-vector multiplication */
  yhat = y;
  polyvecl_ntt(&yhat);
  for (i = 0; i < K; ++i) {
    polyvecl_pointwise_acc_invmontgomery(w.vec + i, mat + i, &yhat);
    poly_reduce(w.vec + i);
    poly_invntt_montgomery(w.vec + i);
  }

  /* Decompose w and call the random oracle */
  polyveck_csubq(&w);
  polyveck_decompose(&w1, &tmp, &w);
  challenge(&c, mu, &w1);

  /* Compute z, reject if it reveals secret */
  chat = c;
  poly_ntt(&chat);
  for (i = 0; i < L; ++i) {
    poly_pointwise_invmontgomery(z.vec + i, &chat, s1.vec + i);
    poly_invntt_montgomery(z.vec + i);
  }
  polyvecl_add(&z, &z, &y);
  polyvecl_freeze(&z);
  if (polyvecl_chknorm(&z, GAMMA1 - BETA))
    goto rej;

  /* Compute w - cs2, reject if w1 can not be computed from it */
  for (i = 0; i < K; ++i) {
    poly_pointwise_invmontgomery(wcs2.vec + i, &chat, s2.vec + i);
    poly_invntt_montgomery(wcs2.vec + i);
  }
  polyveck_sub(&wcs2, &w, &wcs2);
  polyveck_freeze(&wcs2);
  polyveck_decompose(&tmp, &wcs20, &wcs2);
  polyveck_csubq(&wcs20);
  if (polyveck_chknorm(&wcs20, GAMMA2 - BETA))
    goto rej;

  for (i = 0; i < K; ++i)
    for (j = 0; j < N; ++j)
      if (tmp.vec[i].coeffs[j] != w1.vec[i].coeffs[j])
    /* Sample intermediate vector y */
    for (i = 0; i < L; ++i)
        poly_uniform_gamma1m1(y.vec + i, key, nonce++);

    /* Matrix-vector multiplication */
    yhat = y;
    polyvecl_ntt(&yhat);
    for (i = 0; i < K; ++i) {
        polyvecl_pointwise_acc_invmontgomery(w.vec + i, mat + i, &yhat);
        poly_reduce(w.vec + i);
        poly_invntt_montgomery(w.vec + i);
    }

    /* Decompose w and call the random oracle */
    polyveck_csubq(&w);
    polyveck_decompose(&w1, &tmp, &w);
    challenge(&c, mu, &w1);

    /* Compute z, reject if it reveals secret */
    chat = c;
    poly_ntt(&chat);
    for (i = 0; i < L; ++i) {
        poly_pointwise_invmontgomery(z.vec + i, &chat, s1.vec + i);
        poly_invntt_montgomery(z.vec + i);
    }
    polyvecl_add(&z, &z, &y);
    polyvecl_freeze(&z);
    if (polyvecl_chknorm(&z, GAMMA1 - BETA))
        goto rej;

  /* Compute hints for w1 */
  for (i = 0; i < K; ++i) {
    poly_pointwise_invmontgomery(ct0.vec + i, &chat, t0.vec + i);
    poly_invntt_montgomery(ct0.vec + i);
  }
    /* Compute w - cs2, reject if w1 can not be computed from it */
    for (i = 0; i < K; ++i) {
        poly_pointwise_invmontgomery(wcs2.vec + i, &chat, s2.vec + i);
        poly_invntt_montgomery(wcs2.vec + i);
    }
    polyveck_sub(&wcs2, &w, &wcs2);
    polyveck_freeze(&wcs2);
    polyveck_decompose(&tmp, &wcs20, &wcs2);
    polyveck_csubq(&wcs20);
    if (polyveck_chknorm(&wcs20, GAMMA2 - BETA))
        goto rej;

  polyveck_csubq(&ct0);
  if (polyveck_chknorm(&ct0, GAMMA2))
    goto rej;
    for (i = 0; i < K; ++i)
        for (j = 0; j < N; ++j)
            if (tmp.vec[i].coeffs[j] != w1.vec[i].coeffs[j])
                goto rej;

  polyveck_add(&tmp, &wcs2, &ct0);
  polyveck_csubq(&tmp);
  n = polyveck_make_hint(&h, &wcs2, &tmp);
  if (n > OMEGA)
    goto rej;
    /* Compute hints for w1 */
    for (i = 0; i < K; ++i) {
        poly_pointwise_invmontgomery(ct0.vec + i, &chat, t0.vec + i);
        poly_invntt_montgomery(ct0.vec + i);
    }

  /* Write signature */
  pack_sig(sm, &z, &h, &c);
    polyveck_csubq(&ct0);
    if (polyveck_chknorm(&ct0, GAMMA2))
        goto rej;

    polyveck_add(&tmp, &wcs2, &ct0);
    polyveck_csubq(&tmp);
    n = polyveck_make_hint(&h, &wcs2, &tmp);
    if (n > OMEGA)
        goto rej;

  *smlen = mlen + CRYPTO_BYTES;
  return 0;
    /* Write signature */
    pack_sig(sm, &z, &h, &c);

    *smlen = mlen + CRYPTO_BYTES;
    return 0;
 }

 /*************************************************
@@ -291,70 +291,70 @@ rej:
 int crypto_sign_open(unsigned char *m, unsigned long long *mlen,
                     const unsigned char *sm, unsigned long long smlen,
                     const unsigned char *pk) {
  unsigned long long i;
  unsigned char rho[SEEDBYTES];
  unsigned char mu[CRHBYTES];
  poly c, chat, cp;
  polyvecl mat[K], z;
  polyveck t1, w1, h, tmp1, tmp2;

  if (smlen < CRYPTO_BYTES)
    goto badsig;

  *mlen = smlen - CRYPTO_BYTES;

  unpack_pk(rho, &t1, pk);
  if (unpack_sig(&z, &h, &c, sm))
    goto badsig;
  if (polyvecl_chknorm(&z, GAMMA1 - BETA))
    goto badsig;

  /* Compute CRH(CRH(rho, t1), msg) using m as "playground" buffer */
  if (sm != m)
    unsigned long long i;
    unsigned char rho[SEEDBYTES];
    unsigned char mu[CRHBYTES];
    poly c, chat, cp;
    polyvecl mat[K], z;
    polyveck t1, w1, h, tmp1, tmp2;

    if (smlen < CRYPTO_BYTES)
        goto badsig;

    *mlen = smlen - CRYPTO_BYTES;

    unpack_pk(rho, &t1, pk);
    if (unpack_sig(&z, &h, &c, sm))
        goto badsig;
    if (polyvecl_chknorm(&z, GAMMA1 - BETA))
        goto badsig;

    /* Compute CRH(CRH(rho, t1), msg) using m as "playground" buffer */
    if (sm != m)
        for (i = 0; i < *mlen; ++i)
            m[CRYPTO_BYTES + i] = sm[CRYPTO_BYTES + i];

    shake256(m + CRYPTO_BYTES - CRHBYTES, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
    shake256(mu, CRHBYTES, m + CRYPTO_BYTES - CRHBYTES, CRHBYTES + *mlen);

    /* Matrix-vector multiplication; compute Az - c2^dt1 */
    expand_mat(mat, rho);
    polyvecl_ntt(&z);
    for (i = 0; i < K; ++i)
        polyvecl_pointwise_acc_invmontgomery(tmp1.vec + i, mat + i, &z);

    chat = c;
    poly_ntt(&chat);
    polyveck_shiftl(&t1, D);
    polyveck_ntt(&t1);
    for (i = 0; i < K; ++i)
        poly_pointwise_invmontgomery(tmp2.vec + i, &chat, t1.vec + i);

    polyveck_sub(&tmp1, &tmp1, &tmp2);
    polyveck_reduce(&tmp1);
    polyveck_invntt_montgomery(&tmp1);

    /* Reconstruct w1 */
    polyveck_csubq(&tmp1);
    polyveck_use_hint(&w1, &tmp1, &h);

    /* Call random oracle and verify challenge */
    challenge(&cp, mu, &w1);
    for (i = 0; i < N; ++i)
        if (c.coeffs[i] != cp.coeffs[i])
            goto badsig;

    /* All good, copy msg, return 0 */
    for (i = 0; i < *mlen; ++i)
      m[CRYPTO_BYTES + i] = sm[CRYPTO_BYTES + i];

  shake256(m + CRYPTO_BYTES - CRHBYTES, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
  shake256(mu, CRHBYTES, m + CRYPTO_BYTES - CRHBYTES, CRHBYTES + *mlen);

  /* Matrix-vector multiplication; compute Az - c2^dt1 */
  expand_mat(mat, rho);
  polyvecl_ntt(&z);
  for (i = 0; i < K; ++i)
    polyvecl_pointwise_acc_invmontgomery(tmp1.vec + i, mat + i, &z);

  chat = c;
  poly_ntt(&chat);
  polyveck_shiftl(&t1, D);
  polyveck_ntt(&t1);
  for (i = 0; i < K; ++i)
    poly_pointwise_invmontgomery(tmp2.vec + i, &chat, t1.vec + i);

  polyveck_sub(&tmp1, &tmp1, &tmp2);
  polyveck_reduce(&tmp1);
  polyveck_invntt_montgomery(&tmp1);

  /* Reconstruct w1 */
  polyveck_csubq(&tmp1);
  polyveck_use_hint(&w1, &tmp1, &h);

  /* Call random oracle and verify challenge */
  challenge(&cp, mu, &w1);
  for (i = 0; i < N; ++i)
    if (c.coeffs[i] != cp.coeffs[i])
      goto badsig;

  /* All good, copy msg, return 0 */
  for (i = 0; i < *mlen; ++i)
    m[i] = sm[CRYPTO_BYTES + i];
        m[i] = sm[CRYPTO_BYTES + i];

  return 0;
    return 0;

 /* Signature verification failed */
 badsig:
  *mlen = (unsigned long long)-1;
  for (i = 0; i < smlen; ++i)
    m[i] = 0;
    *mlen = (unsigned long long)-1;
    for (i = 0; i < smlen; ++i)
        m[i] = 0;

  return -1;
    return -1;
 }
--- a/crypto_sign/test.c
+++ b/crypto_sign/test.c
@@ -10,86 +10,89 @@
 * make sure it is not touched by the implementations.
 */
 /*
  * Write the 8-byte canary value 0x0123456789ABCDEF at d.
  *
  * The value is copied byte by byte out of a local uint64_t, so the bytes land
  * in the same (native) order a direct 64-bit store would produce, but d does
  * not need to be 8-byte aligned. Callers pass addresses such as
  * buf + sizeof(buf) - 8 into plain unsigned char arrays, which carry no
  * alignment guarantee, so a store through a casted uint64_t pointer is
  * avoided here.
  *
  * Arguments: - unsigned char *d: destination; must have room for 8 bytes
  */
 static void write_canary(unsigned char *d) {
    const uint64_t canary = 0x0123456789ABCDEF;
    const unsigned char *src = (const unsigned char *)&canary;
    unsigned int i;

    for (i = 0; i < sizeof(canary); ++i) {
        d[i] = src[i];
    }
 }

 /*
  * Check whether the 8 bytes at d still hold the canary value
  * 0x0123456789ABCDEF.
  *
  * The comparison is done byte by byte against the native in-memory
  * representation of the value, which matches what a direct 64-bit load
  * would compare but does not require d to be 8-byte aligned.
  *
  * Arguments: - unsigned char *d: location to inspect; 8 bytes are read
  *
  * Returns 0 if the canary is intact, -1 if any byte differs.
  */
 static int check_canary(unsigned char *d) {
    const uint64_t canary = 0x0123456789ABCDEF;
    const unsigned char *expected = (const unsigned char *)&canary;
    unsigned int i;

    for (i = 0; i < sizeof(canary); ++i) {
        if (d[i] != expected[i]) {
            return -1;
        }
    }

    return 0;
 }
 /*
  * Sign/verify round-trip test with canary-guarded buffers.
  *
  * Every buffer is allocated with 16 extra bytes: an 8-byte canary is written
  * at its start and another over its last 8 bytes, and the scheme functions
  * are handed the region starting at offset 8. For each of NTESTS iterations
  * a key pair is generated, a random MLEN-byte message is signed, and the
  * signed message is opened in place (output buffer == input buffer). If
  * verification succeeds, all eight canaries are then checked.
  *
  * Every iteration runs even after a failure so that all errors are printed.
  *
  * Returns 0 if every iteration passed, -1 if any verification failed or any
  * canary was overwritten.
  */
 static int test_sign(void) {
    unsigned char pk[CRYPTO_PUBLICKEYBYTES + 16];
    unsigned char sk[CRYPTO_SECRETKEYBYTES + 16];
    unsigned char sm[MLEN + CRYPTO_BYTES + 16];
    unsigned char m[MLEN + 16];

    unsigned long long mlen;
    unsigned long long smlen;

    int i;
    int failed = 0;

    /* Guard both ends of every buffer. */
    write_canary(pk);
    write_canary(pk + sizeof(pk) - 8);
    write_canary(sk);
    write_canary(sk + sizeof(sk) - 8);
    write_canary(sm);
    write_canary(sm + sizeof(sm) - 8);
    write_canary(m);
    write_canary(m + sizeof(m) - 8);

    for (i = 0; i < NTESTS; i++) {
        crypto_sign_keypair(pk + 8, sk + 8);

        randombytes(m + 8, MLEN);
        crypto_sign(sm + 8, &smlen, m + 8, MLEN, sk + 8);

        // By relying on m == sm we prevent having to allocate CRYPTO_BYTES
        // twice
        if (crypto_sign_open(sm + 8, &mlen, sm + 8, smlen, pk + 8)) {
            printf("ERROR Signature did not verify correctly!\n");
            failed = 1;
        } else if (check_canary(pk) || check_canary(pk + sizeof(pk) - 8) ||
                   check_canary(sk) || check_canary(sk + sizeof(sk) - 8) ||
                   check_canary(sm) || check_canary(sm + sizeof(sm) - 8) ||
                   check_canary(m) || check_canary(m + sizeof(m) - 8)) {
            printf("ERROR canary overwritten\n");
            failed = 1;
        }
    }

    return failed ? -1 : 0;
 }

 /*
  * Negative test: a signature must not verify under an unrelated public key.
  *
  * For each of NTESTS iterations two key pairs are generated in sequence.
  * The second call overwrites sk, so sk ends up belonging to pk while pk2 is
  * left over from the first pair. A random MLEN-byte message is signed with
  * sk and then opened in place using pk2; crypto_sign_open returning 0 here
  * is treated as an error.
  *
  * Every iteration runs even after a failure so that all errors are printed.
  *
  * Returns 0 if no signature verified under the wrong key, -1 otherwise.
  */
 static int test_wrong_pk(void) {
    unsigned char pk[CRYPTO_PUBLICKEYBYTES];
    unsigned char pk2[CRYPTO_PUBLICKEYBYTES];
    unsigned char sk[CRYPTO_SECRETKEYBYTES];
    unsigned char sm[MLEN + CRYPTO_BYTES];
    unsigned char m[MLEN];

    unsigned long long mlen;
    unsigned long long smlen;

    int i;
    int failed = 0;

    for (i = 0; i < NTESTS; i++) {
        /* pk2 keeps the first public key; sk is replaced by the next call. */
        crypto_sign_keypair(pk2, sk);

        crypto_sign_keypair(pk, sk);

        randombytes(m, MLEN);
        crypto_sign(sm, &smlen, m, MLEN, sk);

        // By relying on m == sm we prevent having to allocate CRYPTO_BYTES
        // twice
        if (!crypto_sign_open(sm, &mlen, sm, smlen, pk2)) {
            printf("ERROR Signature did verify correctly under wrong public "
                   "key!\n");
            failed = 1;
        }
    }

    return failed ? -1 : 0;
 }

 /*
  * Test driver: runs test_sign() and test_wrong_pk() once each.
  *
  * Both tests are always executed, even if the first one reports a failure.
  * The exit status is 0 when both return 0 and 1 otherwise.
  */
 int main(void) {
    int failed = 0;

    failed |= (test_sign() != 0);
    failed |= (test_wrong_pk() != 0);

    return failed;
 }