diff --git a/.clang-format b/.clang-format
index 3d584a65..a9e4acbe 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,5 +1,6 @@
 ---
 Language:        Cpp
 BasedOnStyle:  LLVM
+AllowShortFunctionsOnASingleLine: false
 ...
 
diff --git a/common/fips202.c b/common/fips202.c
index 9cca98ff..fc8f2117 100644
--- a/common/fips202.c
+++ b/common/fips202.c
@@ -6,7 +6,7 @@
  * by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe */
 
 #include "fips202.h"
-#include <assert.h>
+#include <stddef.h>
 #include <stdint.h>
 
 #define NROUNDS 24
@@ -22,11 +22,12 @@
  * Returns the loaded 64-bit unsigned integer
  **************************************************/
 static uint64_t load64(const unsigned char *x) {
-  unsigned long long r = 0, i;
+  unsigned int i;
+  uint64_t r = 0;
+
+  for (i = 0; i < 8; ++i)
+    r |= (uint64_t)x[i] << 8 * i;
 
-  for (i = 0; i < 8; ++i) {
-    r |= (unsigned long long)x[i] << 8 * i;
-  }
   return r;
 }
 
@@ -41,10 +42,8 @@ static uint64_t load64(const unsigned char *x) {
 static void store64(uint8_t *x, uint64_t u) {
   unsigned int i;
 
-  for (i = 0; i < 8; ++i) {
-    x[i] = u;
-    u >>= 8;
-  }
+  for (i = 0; i < 8; ++i)
+    x[i] = u >> 8 * i;
 }
 
 /* Keccak round constants */
@@ -67,9 +66,9 @@ static const uint64_t KeccakF_RoundConstants[NROUNDS] = {
  *
  * Description: The Keccak F1600 Permutation
  *
- * Arguments:   - uint64_t * state: pointer to in/output Keccak state
+ * Arguments:   - uint64_t *state: pointer to input/output Keccak state
  **************************************************/
-void KeccakF1600_StatePermute(uint64_t *state) {
+static void KeccakF1600_StatePermute(uint64_t *state) {
   int round;
 
   uint64_t Aba, Abe, Abi, Abo, Abu;
@@ -330,35 +329,27 @@ void KeccakF1600_StatePermute(uint64_t *state) {
   state[22] = Asi;
   state[23] = Aso;
   state[24] = Asu;
-
-#undef round
 }
 
-#include <string.h>
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-
 /*************************************************
  * Name:        keccak_absorb
  *
  * Description: Absorb step of Keccak;
  *              non-incremental, starts by zeroeing the state.
  *
- * Arguments:   - uint64_t *s:             pointer to (uninitialized) output
- *Keccak state
- *              - unsigned int r:          rate in bytes (e.g., 168 for
- *SHAKE128)
- *              - const unsigned char *m:  pointer to input to be absorbed into
- *s
+ * Arguments:   - uint64_t *s: pointer to (uninitialized) output Keccak state
+ *              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
+ *              - const unsigned char *m: pointer to input to be absorbed into s
  *              - unsigned long long mlen: length of input in bytes
- *              - unsigned char p:         domain-separation byte for different
- *Keccak-derived functions
+ *              - unsigned char p: domain-separation byte for different
+ *                                 Keccak-derived functions
  **************************************************/
 static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m,
-                          unsigned long long int mlen, unsigned char p) {
-  unsigned long long i;
+                          unsigned long long mlen, unsigned char p) {
+  unsigned int i;
   unsigned char t[200];
 
-  // Zero state
+  /* Zero state */
   for (i = 0; i < 25; ++i)
     s[i] = 0;
 
@@ -386,20 +377,18 @@ static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m,
  *
  * Description: Squeeze step of Keccak. Squeezes full blocks of r bytes each.
  *              Modifies the state. Can be called multiple times to keep
- *squeezing, i.e., is incremental.
+ *              squeezing, i.e., is incremental.
  *
- * Arguments:   - unsigned char *h:               pointer to output blocks
+ * Arguments:   - unsigned char *h: pointer to output blocks
  *              - unsigned long long int nblocks: number of blocks to be
- *squeezed (written to h)
- *              - uint64_t *s:                    pointer to in/output Keccak
- *state
- *              - unsigned int r:                 rate in bytes (e.g., 168 for
- *SHAKE128)
+ *                                                squeezed (written to h)
+ *              - uint64_t *s: pointer to input/output Keccak state
+ *              - unsigned int r: rate in bytes (e.g., 168 for SHAKE128)
  **************************************************/
-static void keccak_squeezeblocks(unsigned char *h,
-                                 unsigned long long int nblocks, uint64_t *s,
-                                 unsigned int r) {
+static void keccak_squeezeblocks(unsigned char *h, unsigned long nblocks,
+                                 uint64_t *s, unsigned int r) {
   unsigned int i;
+
   while (nblocks > 0) {
     KeccakF1600_StatePermute(s);
     for (i = 0; i < (r >> 3); i++) {
@@ -416,63 +405,122 @@ static void keccak_squeezeblocks(unsigned char *h,
  * Description: Absorb step of the SHAKE128 XOF.
  *              non-incremental, starts by zeroeing the state.
  *
- * Arguments:   - uint64_t *s:                     pointer to (uninitialized)
- *output Keccak state
- *              - const unsigned char *input:      pointer to input to be
- *absorbed into s
- *              - unsigned long long inputByteLen: length of input in bytes
+ * Arguments:   - uint64_t *s: pointer to (uninitialized) output Keccak state
+ *              - const unsigned char *input: pointer to input to be absorbed
+ *                                            into s
+ *              - unsigned long long inlen: length of input in bytes
  **************************************************/
 void shake128_absorb(uint64_t *s, const unsigned char *input,
-                     unsigned int inputByteLen) {
-  keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F);
+                     unsigned long long inlen) {
+  keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F);
 }
 
 /*************************************************
  * Name:        shake128_squeezeblocks
  *
  * Description: Squeeze step of SHAKE128 XOF. Squeezes full blocks of
- *SHAKE128_RATE bytes each. Modifies the state. Can be called multiple times to
- *keep squeezing, i.e., is incremental.
+ *              SHAKE128_RATE bytes each. Modifies the state. Can be called
+ *              multiple times to keep squeezing, i.e., is incremental.
  *
- * Arguments:   - unsigned char *output:      pointer to output blocks
+ * Arguments:   - unsigned char *output: pointer to output blocks
  *              - unsigned long long nblocks: number of blocks to be squeezed
- *(written to output)
- *              - uint64_t *s:                pointer to in/output Keccak state
+ *                                            (written to output)
+ *              - uint64_t *s: pointer to input/output Keccak state
  **************************************************/
-void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks,
+void shake128_squeezeblocks(unsigned char *output, unsigned long nblocks,
                             uint64_t *s) {
   keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
 }
 
 /*************************************************
-* Name:        shake256
-*
-* Description: SHAKE256 XOF with non-incremental API
-*
-* Arguments:   - unsigned char *output:      pointer to output
-*              - unsigned long long outlen:  requested output length in bytes
-               - const unsigned char *input: pointer to input
-               - unsigned long long inlen:   length of input in bytes
-**************************************************/
+ * Name:        shake256_absorb
+ *
+ * Description: Absorb step of the SHAKE256 XOF.
+ *              non-incremental, starts by zeroing the state.
+ *
+ * Arguments:   - uint64_t *s: pointer to (uninitialized) output Keccak state
+ *              - const unsigned char *input: pointer to input to be absorbed
+ *                                            into s
+ *              - unsigned long long inlen: length of input in bytes
+ **************************************************/
+void shake256_absorb(uint64_t *s, const unsigned char *input,
+                     unsigned long long inlen) {
+  keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F);
+}
+
+/*************************************************
+ * Name:        shake256_squeezeblocks
+ *
+ * Description: Squeeze step of SHAKE256 XOF. Squeezes full blocks of
+ *              SHAKE256_RATE bytes each. Modifies the state. Can be called
+ *              multiple times to keep squeezing, i.e., is incremental.
+ *
+ * Arguments:   - unsigned char *output: pointer to output blocks
+ *              - unsigned long nblocks: number of blocks to be squeezed
+ *                                       (written to output)
+ *              - uint64_t *s: pointer to input/output Keccak state
+ **************************************************/
+void shake256_squeezeblocks(unsigned char *output, unsigned long nblocks,
+                            uint64_t *s) {
+  keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
+}
+
+/*************************************************
+ * Name:        shake128
+ *
+ * Description: SHAKE128 XOF with non-incremental API
+ *
+ * Arguments:   - unsigned char *output: pointer to output
+ *              - unsigned long long outlen: requested output length in bytes
+ *              - const unsigned char *input: pointer to input
+ *              - unsigned long long inlen: length of input in bytes
+ **************************************************/
+void shake128(unsigned char *output, unsigned long long outlen,
+              const unsigned char *input, unsigned long long inlen) {
+  unsigned int i;
+  unsigned long nblocks = outlen / SHAKE128_RATE;
+  unsigned char t[SHAKE128_RATE];
+  uint64_t s[25];
+
+  shake128_absorb(s, input, inlen);
+  shake128_squeezeblocks(output, nblocks, s);
+
+  output += nblocks * SHAKE128_RATE;
+  outlen -= nblocks * SHAKE128_RATE;
+
+  if (outlen) {
+    shake128_squeezeblocks(t, 1, s);
+    for (i = 0; i < outlen; ++i)
+      output[i] = t[i];
+  }
+}
+
+/*************************************************
+ * Name:        shake256
+ *
+ * Description: SHAKE256 XOF with non-incremental API
+ *
+ * Arguments:   - unsigned char *output: pointer to output
+ *              - unsigned long long outlen: requested output length in bytes
+ *              - const unsigned char *input: pointer to input
+ *              - unsigned long long inlen: length of input in bytes
+ **************************************************/
 void shake256(unsigned char *output, unsigned long long outlen,
               const unsigned char *input, unsigned long long inlen) {
-  uint64_t s[25];
+  unsigned int i;
+  unsigned long nblocks = outlen / SHAKE256_RATE;
   unsigned char t[SHAKE256_RATE];
-  unsigned long long nblocks = outlen / SHAKE256_RATE;
-  size_t i;
+  uint64_t s[25];
 
-  /* Absorb input */
-  keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F);
-
-  /* Squeeze output */
-  keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
+  shake256_absorb(s, input, inlen);
+  shake256_squeezeblocks(output, nblocks, s);
 
   output += nblocks * SHAKE256_RATE;
   outlen -= nblocks * SHAKE256_RATE;
 
   if (outlen) {
-    keccak_squeezeblocks(t, 1, s, SHAKE256_RATE);
-    for (i = 0; i < outlen; i++)
+    shake256_squeezeblocks(t, 1, s);
+    for (i = 0; i < outlen; ++i)
       output[i] = t[i];
   }
 }
diff --git a/common/fips202.h b/common/fips202.h
index 32acd8fc..6b7da1dc 100644
--- a/common/fips202.h
+++ b/common/fips202.h
@@ -9,12 +9,23 @@
 #define SHA3_512_RATE 72
 
 void shake128_absorb(uint64_t *s, const unsigned char *input,
-                     unsigned int inputByteLen);
-void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks,
+                     unsigned long long inlen);
+
+void shake128_squeezeblocks(unsigned char *output, unsigned long nblocks,
                             uint64_t *s);
 
+void shake256_absorb(uint64_t *s, const unsigned char *input,
+                     unsigned long long inlen);
+
+void shake256_squeezeblocks(unsigned char *output, unsigned long nblocks,
+                            uint64_t *s);
+
+void shake128(unsigned char *output, unsigned long long outlen,
+              const unsigned char *input, unsigned long long inlen);
+
 void shake256(unsigned char *output, unsigned long long outlen,
               const unsigned char *input, unsigned long long inlen);
+
 void sha3_256(unsigned char *output, const unsigned char *input,
               unsigned long long inlen);
 void sha3_512(unsigned char *output, const unsigned char *input,
diff --git a/crypto_kem/kyber768/clean/poly.c b/crypto_kem/kyber768/clean/poly.c
index d1307796..97f705a5 100644
--- a/crypto_kem/kyber768/clean/poly.c
+++ b/crypto_kem/kyber768/clean/poly.c
@@ -155,7 +155,9 @@ void poly_getnoise(poly *r, const unsigned char *seed, unsigned char nonce) {
  *
  * Arguments:   - uint16_t *r: pointer to in/output polynomial
  **************************************************/
-void poly_ntt(poly *r) { ntt(r->coeffs); }
+void poly_ntt(poly *r) {
+  ntt(r->coeffs);
+}
 
 /*************************************************
  * Name:        poly_invntt
@@ -166,7 +168,9 @@ void poly_ntt(poly *r) { ntt(r->coeffs); }
  *
  * Arguments:   - uint16_t *a: pointer to in/output polynomial
  **************************************************/
-void poly_invntt(poly *r) { invntt(r->coeffs); }
+void poly_invntt(poly *r) {
+  invntt(r->coeffs);
+}
 
 /*************************************************
  * Name:        poly_add
diff --git a/crypto_sign/dilithium-iii/clean/api.h b/crypto_sign/dilithium-iii/clean/api.h
new file mode 100644
index 00000000..6b7bb7be
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/api.h
@@ -0,0 +1,22 @@
+#ifndef API_H
+#define API_H
+
+#define MODE 2
+
+#define CRYPTO_PUBLICKEYBYTES 1472U
+#define CRYPTO_SECRETKEYBYTES 3504U
+#define CRYPTO_BYTES 2701U
+
+#define CRYPTO_ALGNAME "Dilithium-III"
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+int crypto_sign(unsigned char *sm, unsigned long long *smlen,
+                const unsigned char *msg, unsigned long long len,
+                const unsigned char *sk);
+
+int crypto_sign_open(unsigned char *m, unsigned long long *mlen,
+                     const unsigned char *sm, unsigned long long smlen,
+                     const unsigned char *pk);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/ntt.c b/crypto_sign/dilithium-iii/clean/ntt.c
new file mode 100644
index 00000000..44ba2348
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/ntt.c
@@ -0,0 +1,135 @@
+#include "ntt.h"
+#include "params.h"
+#include "poly.h"
+#include "reduce.h"
+
+/* Roots of unity in order needed by forward ntt */
+static const uint32_t zetas[N] = {
+    0,       25847,   5771523, 7861508, 237124,  7602457, 7504169, 466468,
+    1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103,
+    2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868,
+    6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005,
+    2706023, 95776,   3077325, 3530437, 6718724, 4788269, 5842901, 3915439,
+    4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118,
+    6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596,
+    811944,  531354,  954230,  3881043, 3900724, 5823537, 2071892, 5582638,
+    4450022, 6851714, 4702672, 5339162, 6927966, 3475950, 2176455, 6795196,
+    7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922,
+    3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370,
+    7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987,
+    5037034, 264944,  508951,  3097992, 44288,   7280319, 904516,  3958618,
+    4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969,  7063561,
+    189548,  4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330,
+    1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961,
+    2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955,
+    266997,  2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039,
+    900702,  1859098, 909542,  819034,  495491,  6767243, 8337157, 7857917,
+    7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579,
+    342297,  286988,  5942594, 4108315, 3437287, 5038140, 1735879, 203044,
+    2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974,
+    4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447,
+    7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775,
+    7100756, 1917081, 5834105, 7005614, 1500165, 777191,  2235880, 3406031,
+    7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136,  4603424,
+    6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531,  7173032,
+    5196991, 162844,  1616392, 3014001, 810149,  1652634, 4686184, 6581310,
+    5341501, 3523897, 3866901, 269760,  2213111, 7404533, 1717735, 472078,
+    7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524,
+    5441381, 6144432, 7959518, 6094090, 183443,  7403526, 1612842, 4834730,
+    7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782};
+
+/* Roots of unity in order needed by inverse ntt */
+static const uint32_t zetas_inv[N] = {
+    6403635, 846154,  6979993, 4442679, 1362209, 48306,   4460757, 554416,
+    3545687, 6767575, 976891,  8196974, 2286327, 420899,  2235985, 2939036,
+    3833893, 260646,  1104333, 1667432, 6470041, 1803090, 6656817, 426683,
+    7908339, 6662682, 975884,  6167306, 8110657, 4513516, 4856520, 3038916,
+    1799107, 3694233, 6727783, 7570268, 5366416, 6764025, 8217573, 3183426,
+    1207385, 8194886, 5011305, 6423145, 164721,  5925962, 5948022, 2013608,
+    3776993, 7786281, 3724270, 2584293, 1846953, 1671176, 2831860, 542412,
+    4974386, 6144537, 7603226, 6880252, 1374803, 2546312, 6463336, 1279661,
+    1962642, 5074302, 7067962, 451100,  1430225, 3318210, 7143142, 1333058,
+    1050970, 6476982, 6511298, 2994039, 3548272, 5744496, 7129923, 3767016,
+    6784443, 5894064, 7132797, 4325093, 7115408, 2590150, 5688936, 5538076,
+    8177373, 6644538, 3342277, 4943130, 4272102, 2437823, 8093429, 8038120,
+    3595838, 768622,  525098,  3556995, 5173371, 6348669, 3122442, 655327,
+    522500,  43260,   1613174, 7884926, 7561383, 7470875, 6521319, 7479715,
+    3193378, 1197226, 3759364, 3520352, 4867236, 1235728, 5945978, 8113420,
+    3562462, 2446433, 6136326, 3342478, 4562441, 6063917, 4972711, 6288750,
+    4540456, 3628969, 3881060, 3019102, 1439742, 812732,  1584928, 7094748,
+    7039087, 7064828, 177440,  2409325, 1851402, 5220671, 3553272, 8190869,
+    1316856, 7620448, 210977,  5991061, 3249728, 6727353, 8578,    3724342,
+    4421799, 7475901, 1100098, 8336129, 5282425, 7871466, 8115473, 3343383,
+    1430430, 6527646, 7031341, 381987,  1308169, 22981,   1228525, 671102,
+    2477047, 411027,  3693493, 2967645, 5665122, 6232521, 983419,  4968207,
+    8253495, 3632928, 3157330, 3190144, 1000202, 4083598, 6441103, 1257611,
+    1585221, 6203962, 4904467, 1452451, 3041255, 3677745, 1528703, 3930395,
+    2797779, 6308525, 2556880, 4479693, 4499374, 7426187, 7849063, 7568473,
+    4680821, 1600420, 2140649, 4873154, 3821735, 4874723, 1643818, 1699267,
+    539299,  6031717, 300467,  4840449, 2867647, 4805995, 3043716, 3861115,
+    4464978, 2537516, 3592148, 1661693, 4849980, 5303092, 8284641, 5674394,
+    8100412, 4369920, 19422,   6623180, 3277672, 1399561, 3859737, 2118186,
+    2108549, 5760665, 1119584, 549488,  4794489, 1079900, 7356305, 5654953,
+    5700314, 5268920, 2884855, 5260684, 2091905, 359251,  6026966, 6554070,
+    7913949, 876248,  777960,  8143293, 518909,  2608894, 8354570};
+
+/*************************************************
+ * Name:        ntt
+ *
+ * Description: Forward NTT, in-place. No modular reduction is performed after
+ *              additions or subtractions. Hence output coefficients can be up
+ *              to 16*Q larger than the coefficients of the input polynomial.
+ *              Output vector is in bitreversed order.
+ *
+ * Arguments:   - uint32_t p[N]: input/output coefficient array
+ **************************************************/
+void ntt(uint32_t p[N]) {
+  unsigned int len, start, j, k;
+  uint32_t zeta, t;
+
+  k = 1;
+  for (len = 128; len > 0; len >>= 1) {
+    for (start = 0; start < N; start = j + len) {
+      zeta = zetas[k++];
+      for (j = start; j < start + len; ++j) {
+        t = montgomery_reduce((uint64_t)zeta * p[j + len]);
+        p[j + len] = p[j] + 2 * Q - t;
+        p[j] = p[j] + t;
+      }
+    }
+  }
+}
+
+/*************************************************
+ * Name:        invntt_frominvmont
+ *
+ * Description: Inverse NTT and multiplication by Montgomery factor 2^32.
+ *              In-place. No modular reductions after additions or
+ *              subtractions. Input coefficients need to be smaller than 2*Q.
+ *              Output coefficients are smaller than 2*Q.
+ *
+ * Arguments:   - uint32_t p[N]: input/output coefficient array
+ **************************************************/
+void invntt_frominvmont(uint32_t p[N]) {
+  unsigned int start, len, j, k;
+  uint32_t t, zeta;
+  const uint32_t f =
+      (((uint64_t)MONT * MONT % Q) * (Q - 1) % Q) * ((Q - 1) >> 8) % Q;
+
+  k = 0;
+  for (len = 1; len < N; len <<= 1) {
+    for (start = 0; start < N; start = j + len) {
+      zeta = zetas_inv[k++];
+      for (j = start; j < start + len; ++j) {
+        t = p[j];
+        p[j] = t + p[j + len];
+        p[j + len] = t + 256 * Q - p[j + len];
+        p[j + len] = montgomery_reduce((uint64_t)zeta * p[j + len]);
+      }
+    }
+  }
+
+  for (j = 0; j < N; ++j) {
+    p[j] = montgomery_reduce((uint64_t)f * p[j]);
+  }
+}
diff --git a/crypto_sign/dilithium-iii/clean/ntt.h b/crypto_sign/dilithium-iii/clean/ntt.h
new file mode 100644
index 00000000..840f8048
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/ntt.h
@@ -0,0 +1,10 @@
+#ifndef NTT_H
+#define NTT_H
+
+#include "params.h"
+#include <stdint.h>
+
+void ntt(uint32_t p[N]);
+void invntt_frominvmont(uint32_t p[N]);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/packing.c b/crypto_sign/dilithium-iii/clean/packing.c
new file mode 100644
index 00000000..3e747b3f
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/packing.c
@@ -0,0 +1,256 @@
+#include "packing.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+
+/*************************************************
+ * Name:        pack_pk
+ *
+ * Description: Bit-pack public key pk = (rho, t1).
+ *
+ * Arguments:   - unsigned char pk[]: output byte array
+ *              - const unsigned char rho[]: byte array containing rho
+ *              - const polyveck *t1: pointer to vector t1
+ **************************************************/
+void pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const unsigned char rho[SEEDBYTES], const polyveck *t1) {
+  unsigned int i;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    pk[i] = rho[i];
+  pk += SEEDBYTES;
+
+  for (i = 0; i < K; ++i)
+    polyt1_pack(pk + i * POLT1_SIZE_PACKED, t1->vec + i);
+}
+
+/*************************************************
+ * Name:        unpack_pk
+ *
+ * Description: Unpack public key pk = (rho, t1).
+ *
+ * Arguments:   - unsigned char rho[]: output byte array for rho
+ *              - polyveck *t1: pointer to output vector t1
+ *              - const unsigned char pk[]: byte array containing bit-packed pk
+ **************************************************/
+void unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1,
+               const unsigned char pk[CRYPTO_PUBLICKEYBYTES]) {
+  unsigned int i;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    rho[i] = pk[i];
+  pk += SEEDBYTES;
+
+  for (i = 0; i < K; ++i)
+    polyt1_unpack(t1->vec + i, pk + i * POLT1_SIZE_PACKED);
+}
+
+/*************************************************
+ * Name:        pack_sk
+ *
+ * Description: Bit-pack secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char sk[]: output byte array
+ *              - const unsigned char rho[]: byte array containing rho
+ *              - const unsigned char key[]: byte array containing key
+ *              - const unsigned char tr[]: byte array containing tr
+ *              - const polyvecl *s1: pointer to vector s1
+ *              - const polyveck *s2: pointer to vector s2
+ *              - const polyveck *t0: pointer to vector t0
+ **************************************************/
+void pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+             const unsigned char rho[SEEDBYTES],
+             const unsigned char key[SEEDBYTES],
+             const unsigned char tr[CRHBYTES], const polyvecl *s1,
+             const polyveck *s2, const polyveck *t0) {
+  unsigned int i;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    sk[i] = rho[i];
+  sk += SEEDBYTES;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    sk[i] = key[i];
+  sk += SEEDBYTES;
+
+  for (i = 0; i < CRHBYTES; ++i)
+    sk[i] = tr[i];
+  sk += CRHBYTES;
+
+  for (i = 0; i < L; ++i)
+    polyeta_pack(sk + i * POLETA_SIZE_PACKED, s1->vec + i);
+  sk += L * POLETA_SIZE_PACKED;
+
+  for (i = 0; i < K; ++i)
+    polyeta_pack(sk + i * POLETA_SIZE_PACKED, s2->vec + i);
+  sk += K * POLETA_SIZE_PACKED;
+
+  for (i = 0; i < K; ++i)
+    polyt0_pack(sk + i * POLT0_SIZE_PACKED, t0->vec + i);
+}
+
+/*************************************************
+ * Name:        unpack_sk
+ *
+ * Description: Unpack secret key sk = (rho, key, tr, s1, s2, t0).
+ *
+ * Arguments:   - unsigned char rho[]: output byte array for rho
+ *              - unsigned char key[]: output byte array for key
+ *              - unsigned char tr[]: output byte array for tr
+ *              - polyvecl *s1: pointer to output vector s1
+ *              - polyveck *s2: pointer to output vector s2
+ *              - polyveck *t0: pointer to output vector t0
+ *              - const unsigned char sk[]: byte array containing bit-packed sk
+ **************************************************/
+void unpack_sk(unsigned char rho[SEEDBYTES], unsigned char key[SEEDBYTES],
+               unsigned char tr[CRHBYTES], polyvecl *s1, polyveck *s2,
+               polyveck *t0, const unsigned char sk[CRYPTO_SECRETKEYBYTES]) {
+  unsigned int i;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    rho[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for (i = 0; i < SEEDBYTES; ++i)
+    key[i] = sk[i];
+  sk += SEEDBYTES;
+
+  for (i = 0; i < CRHBYTES; ++i)
+    tr[i] = sk[i];
+  sk += CRHBYTES;
+
+  for (i = 0; i < L; ++i)
+    polyeta_unpack(s1->vec + i, sk + i * POLETA_SIZE_PACKED);
+  sk += L * POLETA_SIZE_PACKED;
+
+  for (i = 0; i < K; ++i)
+    polyeta_unpack(s2->vec + i, sk + i * POLETA_SIZE_PACKED);
+  sk += K * POLETA_SIZE_PACKED;
+
+  for (i = 0; i < K; ++i)
+    polyt0_unpack(t0->vec + i, sk + i * POLT0_SIZE_PACKED);
+}
+
+/*************************************************
+ * Name:        pack_sig
+ *
+ * Description: Bit-pack signature sig = (z, h, c).
+ *
+ * Arguments:   - unsigned char sig[]: output byte array
+ *              - const polyvecl *z: pointer to vector z
+ *              - const polyveck *h: pointer to hint vector h
+ *              - const poly *c: pointer to challenge polynomial
+ **************************************************/
+void pack_sig(unsigned char sig[CRYPTO_BYTES], const polyvecl *z,
+              const polyveck *h, const poly *c) {
+  unsigned int i, j, k;
+  uint64_t signs, mask;
+
+  for (i = 0; i < L; ++i)
+    polyz_pack(sig + i * POLZ_SIZE_PACKED, z->vec + i);
+  sig += L * POLZ_SIZE_PACKED;
+
+  /* Encode h */
+  k = 0;
+  for (i = 0; i < K; ++i) {
+    for (j = 0; j < N; ++j)
+      if (h->vec[i].coeffs[j] != 0)
+        sig[k++] = j;
+
+    sig[OMEGA + i] = k;
+  }
+  while (k < OMEGA)
+    sig[k++] = 0;
+  sig += OMEGA + K;
+
+  /* Encode c */
+  signs = 0;
+  mask = 1;
+  for (i = 0; i < N / 8; ++i) {
+    sig[i] = 0;
+    for (j = 0; j < 8; ++j) {
+      if (c->coeffs[8 * i + j] != 0) {
+        sig[i] |= (1U << j);
+        if (c->coeffs[8 * i + j] == (Q - 1))
+          signs |= mask;
+        mask <<= 1;
+      }
+    }
+  }
+  sig += N / 8;
+  for (i = 0; i < 8; ++i)
+    sig[i] = signs >> 8 * i;
+}
+
+/*************************************************
+ * Name:        unpack_sig
+ *
+ * Description: Unpack signature sig = (z, h, c).
+ *
+ * Arguments:   - polyvecl *z: pointer to output vector z
+ *              - polyveck *h: pointer to output hint vector h
+ *              - poly *c: pointer to output challenge polynomial
+ *              - const unsigned char sig[]: byte array containing
+ *                bit-packed signature
+ *
+ * Returns 1 in case of malformed signature; otherwise 0.
+ **************************************************/
+int unpack_sig(polyvecl *z, polyveck *h, poly *c,
+               const unsigned char sig[CRYPTO_BYTES]) {
+  unsigned int i, j, k;
+  uint64_t signs, mask;
+
+  for (i = 0; i < L; ++i)
+    polyz_unpack(z->vec + i, sig + i * POLZ_SIZE_PACKED);
+  sig += L * POLZ_SIZE_PACKED;
+
+  /* Decode h */
+  k = 0;
+  for (i = 0; i < K; ++i) {
+    for (j = 0; j < N; ++j)
+      h->vec[i].coeffs[j] = 0;
+
+    if (sig[OMEGA + i] < k || sig[OMEGA + i] > OMEGA)
+      return 1;
+
+    for (j = k; j < sig[OMEGA + i]; ++j) {
+      /* Coefficients are ordered for strong unforgeability */
+      if (j > k && sig[j] <= sig[j - 1])
+        return 1;
+      h->vec[i].coeffs[sig[j]] = 1;
+    }
+
+    k = sig[OMEGA + i];
+  }
+
+  /* Extra indices are zero for strong unforgeability */
+  for (j = k; j < OMEGA; ++j)
+    if (sig[j])
+      return 1;
+
+  sig += OMEGA + K;
+
+  /* Decode c */
+  for (i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+
+  signs = 0;
+  for (i = 0; i < 8; ++i)
+    signs |= (uint64_t)sig[N / 8 + i] << 8 * i;
+
+  /* Extra sign bits are zero for strong unforgeability */
+  if (signs >> 60)
+    return 1;
+
+  mask = 1;
+  for (i = 0; i < N / 8; ++i) {
+    for (j = 0; j < 8; ++j) {
+      if ((sig[i] >> j) & 0x01) {
+        c->coeffs[8 * i + j] = (signs & mask) ? Q - 1 : 1;
+        mask <<= 1;
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/crypto_sign/dilithium-iii/clean/packing.h b/crypto_sign/dilithium-iii/clean/packing.h
new file mode 100644
index 00000000..d9bc5acc
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/packing.h
@@ -0,0 +1,25 @@
+#ifndef PACKING_H
+#define PACKING_H
+
+#include "params.h"
+#include "polyvec.h"
+
+void pack_pk(unsigned char pk[CRYPTO_PUBLICKEYBYTES],
+             const unsigned char rho[SEEDBYTES], const polyveck *t1);
+void pack_sk(unsigned char sk[CRYPTO_SECRETKEYBYTES],
+             const unsigned char rho[SEEDBYTES],
+             const unsigned char key[SEEDBYTES],
+             const unsigned char tr[CRHBYTES], const polyvecl *s1,
+             const polyveck *s2, const polyveck *t0);
+void pack_sig(unsigned char sig[CRYPTO_BYTES], const polyvecl *z,
+              const polyveck *h, const poly *c);
+
+void unpack_pk(unsigned char rho[SEEDBYTES], polyveck *t1,
+               const unsigned char pk[CRYPTO_PUBLICKEYBYTES]);
+void unpack_sk(unsigned char rho[SEEDBYTES], unsigned char key[SEEDBYTES],
+               unsigned char tr[CRHBYTES], polyvecl *s1, polyveck *s2,
+               polyveck *t0, const unsigned char sk[CRYPTO_SECRETKEYBYTES]);
+int unpack_sig(polyvecl *z, polyveck *h, poly *c,
+               const unsigned char sig[CRYPTO_BYTES]);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/params.h b/crypto_sign/dilithium-iii/clean/params.h
new file mode 100644
index 00000000..d7daa1eb
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/params.h
@@ -0,0 +1,68 @@
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#ifndef MODE
+#define MODE 2 /* parameter set used when the build does not define MODE */
+#endif
+
+#define SEEDBYTES 32U /* byte length of the seeds rho and key */
+#define CRHBYTES 48U /* byte length of tr */
+#define N 256U /* number of coefficients in a poly */
+#define Q 8380417U /* modulus; 2^22 < Q < 2^23 */
+#define QBITS 23U /* bit length of Q */
+#define ROOT_OF_UNITY 1753U
+#define D 14U /* power2round splits c mod Q into c1*2^D + c0 */
+#define GAMMA1 ((Q - 1U) / 16U) /* mask range is [-(GAMMA1-1), GAMMA1-1] */
+#define GAMMA2 (GAMMA1 / 2U)
+#define ALPHA (2U * GAMMA2) /* decompose splits c mod Q into c1*ALPHA + c0 */
+/* Per-mode parameters (L: polyvecl length, ETA: bound of small coefficients) */
+#if MODE == 0
+#define K 3U
+#define L 2U
+#define ETA 7U
+#define SETABITS 4U
+#define BETA 375U
+#define OMEGA 64U
+
+#elif MODE == 1
+#define K 4U
+#define L 3U
+#define ETA 6U
+#define SETABITS 4U
+#define BETA 325U
+#define OMEGA 80U
+
+#elif MODE == 2
+#define K 5U
+#define L 4U
+#define ETA 5U
+#define SETABITS 4U
+#define BETA 275U
+#define OMEGA 96U
+
+#elif MODE == 3
+#define K 6U
+#define L 5U
+#define ETA 3U
+#define SETABITS 3U
+#define BETA 175U
+#define OMEGA 120U
+
+#endif
+/* Sizes in bytes of bit-packed polynomials and vectors of polynomials */
+#define POL_SIZE_PACKED ((N * QBITS) / 8)
+#define POLT1_SIZE_PACKED ((N * (QBITS - D)) / 8)
+#define POLT0_SIZE_PACKED ((N * D) / 8)
+#define POLETA_SIZE_PACKED ((N * SETABITS) / 8)
+#define POLZ_SIZE_PACKED ((N * (QBITS - 3)) / 8)
+#define POLW1_SIZE_PACKED ((N * 4) / 8)
+#define POLVECK_SIZE_PACKED (K * POL_SIZE_PACKED)
+#define POLVECL_SIZE_PACKED (L * POL_SIZE_PACKED)
+/* Sizes in bytes of the packed public key, secret key and signature */
+#define CRYPTO_PUBLICKEYBYTES (SEEDBYTES + K * POLT1_SIZE_PACKED)
+#define CRYPTO_SECRETKEYBYTES                                                  \
+  (2 * SEEDBYTES + (L + K) * POLETA_SIZE_PACKED + CRHBYTES +                   \
+   K * POLT0_SIZE_PACKED)
+#define CRYPTO_BYTES (L * POLZ_SIZE_PACKED + (OMEGA + K) + (N / 8 + 8))
+
+#endif /* PARAMS_H */
diff --git a/crypto_sign/dilithium-iii/clean/poly.c b/crypto_sign/dilithium-iii/clean/poly.c
new file mode 100644
index 00000000..814edc1b
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/poly.c
@@ -0,0 +1,764 @@
+#include "poly.h"
+#include "fips202.h"
+#include "ntt.h"
+#include "params.h"
+#include "reduce.h"
+#include "rounding.h"
+#include <stdint.h>
+
+/*************************************************
+ * Name:        poly_reduce
+ *
+ * Description: Reduce every coefficient in place to the range [0,2*Q[.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_reduce(poly *a) {
+  uint32_t *coeff = a->coeffs;
+  uint32_t *const end = coeff + N;
+
+  for (; coeff != end; ++coeff)
+    *coeff = reduce32(*coeff);
+}
+
+/*************************************************
+ * Name:        poly_csubq
+ *
+ * Description: Pass every coefficient through csubq(), i.e. subtract Q
+ *              from each coefficient that is bigger than Q.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_csubq(poly *a) {
+  unsigned int idx = 0;
+
+  for (; idx != N; idx++)
+    a->coeffs[idx] = csubq(a->coeffs[idx]);
+}
+
+/*************************************************
+ * Name:        poly_freeze
+ *
+ * Description: Map every coefficient to its standard representative.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_freeze(poly *a) {
+  uint32_t *const c = a->coeffs;
+  unsigned int idx;
+
+  for (idx = 0; idx < N; idx++)
+    c[idx] = freeze(c[idx]);
+}
+
+/*************************************************
+ * Name:        poly_add
+ *
+ * Description: Coefficient-wise sum c = a + b; nothing is reduced mod Q.
+ *
+ * Arguments:   - poly *c: pointer to output polynomial
+ *              - const poly *a: pointer to first summand
+ *              - const poly *b: pointer to second summand
+ **************************************************/
+void poly_add(poly *c, const poly *a, const poly *b) {
+  unsigned int idx = 0;
+
+  for (; idx < N; ++idx)
+    c->coeffs[idx] = b->coeffs[idx] + a->coeffs[idx];
+}
+
+/*************************************************
+ * Name:        poly_sub
+ *
+ * Description: Coefficient-wise difference c = a - b, offset by 2*Q and
+ *              not reduced. Coefficients of b must be less than 2*Q.
+ *
+ * Arguments:   - poly *c: pointer to output polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial, which
+ *                               is subtracted from the first one
+ **************************************************/
+void poly_sub(poly *c, const poly *a, const poly *b) {
+  unsigned int idx;
+
+  /* The 2*Q offset keeps each result non-negative as long as b < 2*Q. */
+  for (idx = 0; idx < N; ++idx)
+    c->coeffs[idx] = 2 * Q + a->coeffs[idx] - b->coeffs[idx];
+}
+
+/*************************************************
+ * Name:        poly_neg
+ *
+ * Description: Negate: x -> Q - x, for x a standard representative.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_neg(poly *a) {
+  uint32_t *coeff = a->coeffs;
+  uint32_t *const end = coeff + N;
+
+  for (; coeff != end; ++coeff)
+    *coeff = Q - *coeff;
+}
+
+/*************************************************
+ * Name:        poly_shiftl
+ *
+ * Description: Shift every coefficient left by k bits, i.e. multiply by 2^k
+ *              with no modular reduction. Inputs must be below 2^{32-k}.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ *              - unsigned int k: exponent
+ **************************************************/
+void poly_shiftl(poly *a, unsigned int k) {
+  uint32_t *coeff = a->coeffs;
+
+  for (; coeff != a->coeffs + N; ++coeff)
+    *coeff = *coeff << k;
+}
+
+/*************************************************
+ * Name:        poly_ntt
+ *
+ * Description: Forward NTT, computed in place by ntt(). Coefficients may
+ *              grow by up to 16*Q relative to the input.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_ntt(poly *a) {
+  ntt(&a->coeffs[0]);
+}
+
+/*************************************************
+ * Name:        poly_invntt_montgomery
+ *
+ * Description: Inverse NTT combined with a multiplication by 2^{32}.
+ *              Inputs must be below 2*Q; outputs are below 2*Q as well.
+ *
+ * Arguments:   - poly *a: pointer to input/output polynomial
+ **************************************************/
+void poly_invntt_montgomery(poly *a) {
+  invntt_frominvmont(&a->coeffs[0]);
+}
+
+/*************************************************
+ * Name:        poly_pointwise_invmontgomery
+ *
+ * Description: Pointwise product in the NTT domain, scaled by 2^{-32}.
+ *              Outputs are < 2*Q if input coefficients are < 22*Q.
+ *
+ * Arguments:   - poly *c: pointer to output polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ **************************************************/
+void poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b) {
+  unsigned int idx;
+
+  for (idx = 0; idx < N; ++idx) {
+    const uint64_t prod = (uint64_t)a->coeffs[idx] * b->coeffs[idx];
+    c->coeffs[idx] = montgomery_reduce(prod);
+  }
+}
+
+/*************************************************
+ * Name:        poly_power2round
+ *
+ * Description: Split every coefficient c of the input polynomial into
+ *              c1 and c0 such that c mod Q = c1*2^D + c0 with
+ *              -2^{D-1} < c0 <= 2^{D-1}. Inputs must be standard
+ *              representatives.
+ *
+ * Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+ *              - poly *a0: pointer to output polynomial with coefficients
+ *                          Q + c0
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void poly_power2round(poly *a1, poly *a0, const poly *a) {
+  unsigned int idx = 0;
+
+  for (; idx < N; ++idx)
+    a1->coeffs[idx] = power2round(a->coeffs[idx], &a0->coeffs[idx]);
+}
+
+/*************************************************
+ * Name:        poly_decompose
+ *
+ * Description: Split every coefficient c into high bits c1 and low bits c0
+ *              with c mod Q = c1*ALPHA + c0 and -ALPHA/2 < c0 <= ALPHA/2.
+ *              If c1 = (Q-1)/ALPHA, c1 is set to 0 instead and
+ *              -ALPHA/2 <= c0 = c mod Q - Q < 0. Inputs must be standard
+ *              representatives.
+ *
+ * Arguments:   - poly *a1: pointer to output polynomial with coefficients c1
+ *              - poly *a0: pointer to output polynomial with coefficients
+ *                          Q + c0
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void poly_decompose(poly *a1, poly *a0, const poly *a) {
+  unsigned int idx = 0;
+
+  for (; idx < N; ++idx)
+    a1->coeffs[idx] = decompose(a->coeffs[idx], &a0->coeffs[idx]);
+}
+
+/*************************************************
+ * Name:        poly_make_hint
+ *
+ * Description: Compute the hint polynomial: coefficient i of h is
+ *              make_hint() of coefficient i of a and b, indicating whether
+ *              the high bits of a and of the sum a + b differ.
+ *
+ * Arguments:   - poly *h: pointer to output hint polynomial
+ *              - const poly *a: pointer to first input polynomial
+ *              - const poly *b: pointer to second input polynomial
+ *
+ * Returns the sum of all hint coefficients (the number of 1 bits).
+ **************************************************/
+unsigned int poly_make_hint(poly *h, const poly *a, const poly *b) {
+  unsigned int idx, ones = 0;
+
+  for (idx = 0; idx < N; ++idx) {
+    const uint32_t bit = make_hint(a->coeffs[idx], b->coeffs[idx]);
+    h->coeffs[idx] = bit;
+    ones += bit;
+  }
+  return ones;
+}
+
+/*************************************************
+ * Name:        poly_use_hint
+ *
+ * Description: Apply use_hint() coefficient-wise to correct the high bits.
+ *
+ * Arguments:   - poly *a: pointer to output polynomial with corrected high bits
+ *              - const poly *b: pointer to input polynomial
+ *              - const poly *h: pointer to input hint polynomial
+ **************************************************/
+void poly_use_hint(poly *a, const poly *b, const poly *h) {
+  unsigned int idx = 0;
+
+  for (; idx < N; ++idx)
+    a->coeffs[idx] = use_hint(b->coeffs[idx], h->coeffs[idx]);
+}
+
+/*************************************************
+ * Name:        poly_chknorm
+ *
+ * Description: Check infinity norm (over centralized representatives) of
+ *              the polynomial against B. Inputs: standard representatives.
+ *
+ * Arguments:   - const poly *a: pointer to polynomial
+ *              - uint32_t B: norm bound
+ *
+ * Returns 0 if norm is strictly smaller than B and 1 otherwise.
+ **************************************************/
+int poly_chknorm(const poly *a, uint32_t B) {
+  unsigned int i;
+  int32_t t;
+
+  /* It is ok to leak which coefficient violates the bound since
+     the probability for each coefficient is independent of secret
+     data but we must not leak the sign of the centralized representative. */
+  for (i = 0; i < N; ++i) {
+    /* Branch-free |x|: yields x for x <= (Q-1)/2 and Q - x otherwise */
+    t = (Q - 1) / 2 - a->coeffs[i];
+    t ^= (t >> 31); /* flips all bits iff t < 0; relies on arithmetic >> */
+    t = (Q - 1) / 2 - t;
+
+    if ((uint32_t)t >= B) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/*************************************************
+ * Name:        poly_uniform
+ *
+ * Description: Fill the polynomial by rejection sampling: consecutive
+ *              3-byte groups of buf are read as 23-bit values and kept
+ *              when smaller than Q. The caller must supply enough bytes
+ *              (e.g. 5*SHAKE128_RATE); no length check is performed.
+ *
+ * Arguments:   - poly *a: pointer to output polynomial
+ *              - const unsigned char *buf: array of random bytes
+ **************************************************/
+void poly_uniform(poly *a, const unsigned char *buf) {
+  unsigned int filled = 0;
+  uint32_t cand;
+
+  while (filled < N) {
+    cand = (uint32_t)buf[0] | ((uint32_t)buf[1] << 8) |
+           ((uint32_t)buf[2] << 16);
+    cand &= 0x7FFFFF;
+    buf += 3;
+
+    if (cand < Q)
+      a->coeffs[filled++] = cand;
+  }
+}
+
+/*************************************************
+ * Name:        rej_eta
+ *
+ * Description: Rejection-sample coefficients in [-ETA, ETA] from an array
+ *              of random bytes; they are stored as values in [Q-ETA, Q+ETA].
+ *
+ * Arguments:   - uint32_t *a: pointer to output array (allocated)
+ *              - unsigned int len: number of coefficients to be sampled
+ *              - const unsigned char *buf: array of random bytes
+ *              - unsigned int buflen: length of array of random bytes
+ *
+ * Returns number of sampled coefficients. Can be smaller than len if not enough
+ * random bytes were given.
+ **************************************************/
+static unsigned int rej_eta(uint32_t *a, unsigned int len,
+                            const unsigned char *buf, unsigned int buflen) {
+#if ETA > 7
+#error "rej_eta() assumes ETA <= 7"
+#endif
+  unsigned int accepted = 0, pos;
+
+  for (pos = 0; accepted < len && pos < buflen; ++pos) {
+    const unsigned char byte = buf[pos];
+#if ETA <= 3
+    const unsigned char lo = byte & 0x07;
+    const unsigned char hi = byte >> 5;
+#else
+    const unsigned char lo = byte & 0x0F;
+    const unsigned char hi = byte >> 4;
+#endif
+
+    if (lo <= 2 * ETA)
+      a[accepted++] = Q + ETA - lo;
+    if (hi <= 2 * ETA && accepted < len)
+      a[accepted++] = Q + ETA - hi;
+  }
+
+  return accepted;
+}
+
+/*************************************************
+ * Name:        poly_uniform_eta
+ *
+ * Description: Sample a polynomial with uniformly random coefficients in
+ *              [-ETA,ETA] by rejection sampling on the SHAKE256 output
+ *              stream for the input seed|nonce.
+ *
+ * Arguments:   - poly *a: pointer to output polynomial
+ *              - const unsigned char seed[]: byte array with seed of length
+ *                                            SEEDBYTES
+ *              - unsigned char nonce: nonce byte
+ **************************************************/
+void poly_uniform_eta(poly *a, const unsigned char seed[SEEDBYTES],
+                      unsigned char nonce) {
+  /* Probability that we need more than 2 blocks: < 2^{-84}
+     Probability that we need more than 3 blocks: < 2^{-352} */
+  unsigned char outbuf[2 * SHAKE256_RATE];
+  unsigned char inbuf[SEEDBYTES + 1];
+  uint64_t state[25];
+  unsigned int idx, done;
+
+  for (idx = 0; idx < SEEDBYTES; ++idx)
+    inbuf[idx] = seed[idx];
+  inbuf[idx] = nonce;
+
+  shake256_absorb(state, inbuf, SEEDBYTES + 1);
+  shake256_squeezeblocks(outbuf, 2, state);
+
+  done = rej_eta(a->coeffs, N, outbuf, 2 * SHAKE256_RATE);
+  if (done < N) {
+    shake256_squeezeblocks(outbuf, 1, state);
+    rej_eta(a->coeffs + done, N - done, outbuf, SHAKE256_RATE);
+  }
+}
+
+/*************************************************
+ * Name:        rej_gamma1m1
+ *
+ * Description: Rejection-sample coefficients in
+ *              [-(GAMMA1 - 1), GAMMA1 - 1] from an array of random bytes;
+ *              each accepted 20-bit value t is stored as Q + GAMMA1 - 1 - t.
+ *
+ * Arguments:   - uint32_t *a: pointer to output array (allocated)
+ *              - unsigned int len: number of coefficients to be sampled
+ *              - const unsigned char *buf: array of random bytes
+ *              - unsigned int buflen: length of array of random bytes
+ *
+ * Returns number of sampled coefficients. Can be smaller than len if not enough
+ * random bytes were given.
+ **************************************************/
+static unsigned int rej_gamma1m1(uint32_t *a, unsigned int len,
+                                 const unsigned char *buf,
+                                 unsigned int buflen) {
+#if GAMMA1 > (1 << 19)
+#error "rej_gamma1m1() assumes GAMMA1 - 1 fits in 19 bits"
+#endif
+  unsigned int accepted = 0, pos;
+  uint32_t lo, hi;
+
+  /* Every 5 input bytes carry two 20-bit candidates */
+  for (pos = 0; accepted < len && pos + 5 <= buflen; pos += 5) {
+    const unsigned char *chunk = buf + pos;
+
+    lo = chunk[0];
+    lo |= (uint32_t)chunk[1] << 8;
+    lo |= (uint32_t)chunk[2] << 16;
+    lo &= 0xFFFFF;
+
+    hi = chunk[2] >> 4;
+    hi |= (uint32_t)chunk[3] << 4;
+    hi |= (uint32_t)chunk[4] << 12;
+
+    if (lo <= 2 * GAMMA1 - 2)
+      a[accepted++] = Q + GAMMA1 - 1 - lo;
+    if (hi <= 2 * GAMMA1 - 2 && accepted < len)
+      a[accepted++] = Q + GAMMA1 - 1 - hi;
+  }
+
+  return accepted;
+}
+
+/*************************************************
+ * Name:        poly_uniform_gamma1m1
+ *
+ * Description: Sample a polynomial with uniformly random coefficients in
+ *              [-(GAMMA1 - 1), GAMMA1 - 1] by rejection sampling on the
+ *              SHAKE256 output stream for the input seed|nonce.
+ *
+ * Arguments:   - poly *a: pointer to output polynomial
+ *              - const unsigned char seed[]: byte array with seed of length
+ *                                            SEEDBYTES + CRHBYTES
+ *              - uint16_t nonce: 16-bit nonce
+ **************************************************/
+void poly_uniform_gamma1m1(poly *a,
+                           const unsigned char seed[SEEDBYTES + CRHBYTES],
+                           uint16_t nonce) {
+  /* Probability that we need more than 5 blocks: < 2^{-81}
+     Probability that we need more than 6 blocks: < 2^{-467} */
+  unsigned char outbuf[5 * SHAKE256_RATE];
+  unsigned char inbuf[SEEDBYTES + CRHBYTES + 2];
+  uint64_t state[25];
+  unsigned int idx, done;
+
+  for (idx = 0; idx < SEEDBYTES + CRHBYTES; ++idx)
+    inbuf[idx] = seed[idx];
+  inbuf[idx++] = nonce & 0xFF; /* nonce is appended low byte first */
+  inbuf[idx] = nonce >> 8;
+
+  shake256_absorb(state, inbuf, SEEDBYTES + CRHBYTES + 2);
+  shake256_squeezeblocks(outbuf, 5, state);
+
+  done = rej_gamma1m1(a->coeffs, N, outbuf, 5 * SHAKE256_RATE);
+  if (done < N) {
+    /* There are no bytes left in outbuf
+       since 5*SHAKE256_RATE is divisible by 5 */
+    shake256_squeezeblocks(outbuf, 1, state);
+    rej_gamma1m1(a->coeffs + done, N - done, outbuf, SHAKE256_RATE);
+  }
+}
+
+/*************************************************
+ * Name:        polyeta_pack
+ *
+ * Description: Bit-pack a polynomial whose coefficients represent values
+ *              in [-ETA,ETA] and are assumed to lie in [Q-ETA,Q+ETA].
+ *              Each coefficient x is stored as Q + ETA - x, using 3 bits
+ *              per coefficient if ETA <= 3 and 4 bits otherwise, with the
+ *              first coefficient in the least significant bits.
+ *
+ * Arguments:   - unsigned char *r: pointer to output byte array with at least
+ *                                  POLETA_SIZE_PACKED bytes
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void polyeta_pack(unsigned char *r, const poly *a) {
+#if ETA > 7
+#error "polyeta_pack() assumes ETA <= 7"
+#endif
+  unsigned int i;
+  unsigned char t[8];
+
+#if ETA <= 3
+  unsigned int j;
+
+  /* Eight 3-bit values are packed into three bytes */
+  for (i = 0; i < N / 8; ++i) {
+    unsigned char *out = r + 3 * i;
+
+    /* Q + ETA - coefficient lies in {0,...,2*ETA} */
+    for (j = 0; j < 8; ++j)
+      t[j] = Q + ETA - a->coeffs[8 * i + j];
+
+    out[0] = t[0] | (t[1] << 3) | (t[2] << 6);
+    out[1] = (t[2] >> 2) | (t[3] << 1) | (t[4] << 4) | (t[5] << 7);
+    out[2] = (t[5] >> 1) | (t[6] << 2) | (t[7] << 5);
+  }
+#else
+  /* Two 4-bit values are packed into one byte */
+  for (i = 0; i < N / 2; ++i) {
+    const uint32_t *in = a->coeffs + 2 * i;
+
+    /* Q + ETA - coefficient lies in {0,...,2*ETA} */
+    t[0] = Q + ETA - in[0];
+    t[1] = Q + ETA - in[1];
+    r[i] = t[0] | (t[1] << 4);
+  }
+#endif
+}
+
+/*************************************************
+ * Name:        polyeta_unpack
+ *
+ * Description: Unpack a polynomial whose coefficients represent values in
+ *              [-ETA,ETA]. Output coefficients lie in [Q-ETA,Q+ETA].
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const unsigned char *a: byte array with bit-packed polynomial
+ **************************************************/
+void polyeta_unpack(poly *r, const unsigned char *a) {
+  unsigned int i;
+
+#if ETA <= 3
+  unsigned int j;
+
+  for (i = 0; i < N / 8; ++i) {
+    const unsigned char *in = a + 3 * i;
+    uint32_t *out = r->coeffs + 8 * i;
+
+    /* Split three bytes into eight 3-bit fields */
+    out[0] = in[0] & 0x07;
+    out[1] = (in[0] >> 3) & 0x07;
+    out[2] = (in[0] >> 6) | ((in[1] & 0x01) << 2);
+    out[3] = (in[1] >> 1) & 0x07;
+    out[4] = (in[1] >> 4) & 0x07;
+    out[5] = (in[1] >> 7) | ((in[2] & 0x03) << 1);
+    out[6] = (in[2] >> 2) & 0x07;
+    out[7] = in[2] >> 5;
+
+    for (j = 0; j < 8; ++j)
+      out[j] = Q + ETA - out[j];
+  }
+#else
+  for (i = 0; i < N / 2; ++i) {
+    const unsigned char byte = a[i];
+
+    r->coeffs[2 * i + 0] = Q + ETA - (byte & 0x0F);
+    r->coeffs[2 * i + 1] = Q + ETA - (byte >> 4);
+  }
+#endif
+}
+
+/*************************************************
+ * Name:        polyt1_pack
+ *
+ * Description: Bit-pack polynomial t1 whose coefficients fit in 9 bits.
+ *              Input coefficients must be standard representatives.
+ *              Eight coefficients are written to nine consecutive bytes,
+ *              each coefficient starting with its least significant bits
+ *              directly after the previous one.
+ *
+ * Arguments:   - unsigned char *r: pointer to output byte array with at least
+ *                                  POLT1_SIZE_PACKED bytes
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void polyt1_pack(unsigned char *r, const poly *a) {
+#if D != 14
+#error "polyt1_pack() assumes D == 14"
+#endif
+  unsigned int i;
+
+  /* One iteration handles 8 coefficients and emits 9 bytes */
+  for (i = 0; i < N / 8; ++i) {
+    const uint32_t *c = a->coeffs + 8 * i;
+    unsigned char *out = r + 9 * i;
+
+    out[0] = c[0] & 0xFF;
+    out[1] = (c[0] >> 8) | ((c[1] & 0x7F) << 1);
+    out[2] = (c[1] >> 7) | ((c[2] & 0x3F) << 2);
+    out[3] = (c[2] >> 6) | ((c[3] & 0x1F) << 3);
+    out[4] = (c[3] >> 5) | ((c[4] & 0x0F) << 4);
+    out[5] = (c[4] >> 4) | ((c[5] & 0x07) << 5);
+    out[6] = (c[5] >> 3) | ((c[6] & 0x03) << 6);
+    out[7] = (c[6] >> 2) | ((c[7] & 0x01) << 7);
+    out[8] = c[7] >> 1;
+  }
+}
+
+/*************************************************
+ * Name:        polyt1_unpack
+ *
+ * Description: Unpack polynomial t1 with 9-bit coefficients, reading
+ *              nine bytes for every eight coefficients. Within such a
+ *              group the fields follow each other without gaps, least
+ *              significant bits first.
+ *              Output coefficients are standard representatives.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const unsigned char *a: byte array with bit-packed polynomial
+ **************************************************/
+void polyt1_unpack(poly *r, const unsigned char *a) {
+  unsigned int i;
+
+  for (i = 0; i < N / 8; ++i) {
+    const unsigned char *in = a + 9 * i;
+    uint32_t *c = r->coeffs + 8 * i;
+
+    /* Coefficient k starts at bit 9*k of the 72-bit group, so it takes
+       the top 8-k bits of in[k] and the low k+1 bits of in[k+1]. */
+    c[0] = in[0] | ((uint32_t)(in[1] & 0x01) << 8);
+    c[1] = (in[1] >> 1) | ((uint32_t)(in[2] & 0x03) << 7);
+    c[2] = (in[2] >> 2) | ((uint32_t)(in[3] & 0x07) << 6);
+    c[3] = (in[3] >> 3) | ((uint32_t)(in[4] & 0x0F) << 5);
+    c[4] = (in[4] >> 4) | ((uint32_t)(in[5] & 0x1F) << 4);
+    c[5] = (in[5] >> 5) | ((uint32_t)(in[6] & 0x3F) << 3);
+    c[6] = (in[6] >> 6) | ((uint32_t)(in[7] & 0x7F) << 2);
+    c[7] = (in[7] >> 7) | ((uint32_t)(in[8] & 0xFF) << 1);
+  }
+}
+
+/*************************************************
+ * Name:        polyt0_pack
+ *
+ * Description: Bit-pack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+ *              Input coefficients are assumed to lie in ]Q-2^{D-1}, Q+2^{D-1}].
+ *              Each coefficient x is stored as Q + 2^{D-1} - x; four such
+ *              values are written to seven consecutive bytes.
+ *
+ * Arguments:   - unsigned char *r: pointer to output byte array with at least
+ *                                  POLT0_SIZE_PACKED bytes
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void polyt0_pack(unsigned char *r, const poly *a) {
+  unsigned int i, j;
+  uint32_t t[4];
+
+  for (i = 0; i < N / 4; ++i) {
+    unsigned char *out = r + 7 * i;
+
+    /* For inputs in the assumed range these values are below 2^D */
+    for (j = 0; j < 4; ++j)
+      t[j] = Q + (1 << (D - 1)) - a->coeffs[4 * i + j];
+
+    out[0] = t[0];
+    out[1] = (t[0] >> 8) | (t[1] << 6);
+    out[2] = t[1] >> 2;
+    out[3] = (t[1] >> 10) | (t[2] << 4);
+    out[4] = t[2] >> 4;
+    out[5] = (t[2] >> 12) | (t[3] << 2);
+    out[6] = t[3] >> 6;
+  }
+}
+
+/*************************************************
+ * Name:        polyt0_unpack
+ *
+ * Description: Unpack polynomial t0 with coefficients in ]-2^{D-1}, 2^{D-1}].
+ *              Seven input bytes yield four 14-bit fields v; each output
+ *              coefficient is Q + 2^{D-1} - v, so the outputs lie in
+ *              ]Q-2^{D-1},Q+2^{D-1}].
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const unsigned char *a: byte array with bit-packed polynomial
+ **************************************************/
+void polyt0_unpack(poly *r, const unsigned char *a) {
+  unsigned int i, j;
+
+  for (i = 0; i < N / 4; ++i) {
+    const unsigned char *in = a + 7 * i;
+    uint32_t *c = r->coeffs + 4 * i;
+
+    c[0] = in[0] | ((uint32_t)(in[1] & 0x3F) << 8);
+
+    c[1] = (in[1] >> 6) | ((uint32_t)in[2] << 2) |
+           ((uint32_t)(in[3] & 0x0F) << 10);
+
+    c[2] = (in[3] >> 4) | ((uint32_t)in[4] << 4) |
+           ((uint32_t)(in[5] & 0x03) << 12);
+
+    c[3] = (in[5] >> 2) | ((uint32_t)in[6] << 6);
+
+    /* Convert the stored fields back to coefficients */
+    for (j = 0; j < 4; ++j)
+      c[j] = Q + (1 << (D - 1)) - c[j];
+  }
+}
+
+/*************************************************
+ * Name:        polyz_pack
+ *
+ * Description: Bit-pack z, whose coefficients are standard representatives
+ *              of values in [-(GAMMA1 - 1), GAMMA1 - 1].
+ *
+ * Arguments:   - unsigned char *r: pointer to output byte array with at least
+ *                                  POLZ_SIZE_PACKED bytes
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void polyz_pack(unsigned char *r, const poly *a) {
+#if GAMMA1 > (1 << 19)
+#error "polyz_pack() assumes GAMMA1 <= 2^{19}"
+#endif
+  unsigned int i, j;
+  uint32_t t[2];
+
+  for (i = 0; i < N / 2; ++i) {
+    unsigned char *out = r + 5 * i;
+
+    /* Map to {0,...,2*GAMMA1 - 2} */
+    for (j = 0; j < 2; ++j) {
+      t[j] = GAMMA1 - 1 - a->coeffs[2 * i + j];
+      t[j] += ((int32_t)t[j] >> 31) & Q;
+    }
+
+    out[0] = t[0];
+    out[1] = t[0] >> 8;
+    out[2] = (t[0] >> 16) | (t[1] << 4);
+    out[3] = t[1] >> 4;
+    out[4] = t[1] >> 12;
+  }
+}
+
+/*************************************************
+ * Name:        polyz_unpack
+ *
+ * Description: Unpack polynomial z, whose coefficients represent values
+ *              in [-(GAMMA1 - 1), GAMMA1 - 1].
+ *              Output coefficients are standard representatives.
+ *
+ * Arguments:   - poly *r: pointer to output polynomial
+ *              - const unsigned char *a: byte array with bit-packed polynomial
+ **************************************************/
+void polyz_unpack(poly *r, const unsigned char *a) {
+  unsigned int i, j;
+
+  for (i = 0; i < N / 2; ++i) {
+    const unsigned char *in = a + 5 * i;
+    uint32_t *c = r->coeffs + 2 * i;
+
+    /* Five bytes hold two 20-bit fields */
+    c[0] = in[0] | ((uint32_t)in[1] << 8) | ((uint32_t)(in[2] & 0x0F) << 16);
+    c[1] = (in[2] >> 4) | ((uint32_t)in[3] << 4) | ((uint32_t)in[4] << 12);
+
+    /* Subtract from GAMMA1 - 1 and add Q where that went negative */
+    for (j = 0; j < 2; ++j) {
+      c[j] = GAMMA1 - 1 - c[j];
+      c[j] += ((int32_t)c[j] >> 31) & Q;
+    }
+  }
+}
+
+/*************************************************
+ * Name:        polyw1_pack
+ *
+ * Description: Bit-pack polynomial w1, two coefficients from [0, 15] per
+ *              byte. Input coefficients must be standard representatives.
+ *
+ * Arguments:   - unsigned char *r: pointer to output byte array with at least
+ *                                  POLW1_SIZE_PACKED bytes
+ *              - const poly *a: pointer to input polynomial
+ **************************************************/
+void polyw1_pack(unsigned char *r, const poly *a) {
+  unsigned int idx = 0;
+
+  for (; idx < N / 2; ++idx)
+    r[idx] = (a->coeffs[2 * idx + 1] << 4) | a->coeffs[2 * idx];
+}
diff --git a/crypto_sign/dilithium-iii/clean/poly.h b/crypto_sign/dilithium-iii/clean/poly.h
new file mode 100644
index 00000000..2ae614ee
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/poly.h
@@ -0,0 +1,51 @@
+#ifndef POLY_H
+#define POLY_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stdint.h>
+/* Polynomial: N unsigned 32-bit coefficients, declared 32-byte aligned */
+typedef struct {
+  uint32_t coeffs[N];
+} poly __attribute__((aligned(32)));
+/* In-place reductions applied to every coefficient */
+void poly_reduce(poly *a);
+void poly_csubq(poly *a);
+void poly_freeze(poly *a);
+/* Coefficient-wise arithmetic */
+void poly_add(poly *c, const poly *a, const poly *b);
+void poly_sub(poly *c, const poly *a, const poly *b);
+void poly_neg(poly *a);
+void poly_shiftl(poly *a, unsigned int k);
+/* NTT, inverse NTT and pointwise multiplication in the NTT domain */
+void poly_ntt(poly *a);
+void poly_invntt_montgomery(poly *a);
+void poly_pointwise_invmontgomery(poly *c, const poly *a, const poly *b);
+/* Rounding, decomposition and hints */
+void poly_power2round(poly *a1, poly *a0, const poly *a);
+void poly_decompose(poly *a1, poly *a0, const poly *a);
+unsigned int poly_make_hint(poly *h, const poly *a, const poly *b);
+void poly_use_hint(poly *a, const poly *b, const poly *h);
+/* Norm check (returns 0 if below the bound, 1 otherwise) and sampling */
+int poly_chknorm(const poly *a, uint32_t B);
+void poly_uniform(poly *a, const unsigned char *buf);
+void poly_uniform_eta(poly *a, const unsigned char seed[SEEDBYTES],
+                      unsigned char nonce);
+void poly_uniform_gamma1m1(poly *a,
+                           const unsigned char seed[SEEDBYTES + CRHBYTES],
+                           uint16_t nonce);
+/* Bit-packing; r is the destination and a the source in every function */
+void polyeta_pack(unsigned char *r, const poly *a);
+void polyeta_unpack(poly *r, const unsigned char *a);
+
+void polyt1_pack(unsigned char *r, const poly *a);
+void polyt1_unpack(poly *r, const unsigned char *a);
+
+void polyt0_pack(unsigned char *r, const poly *a);
+void polyt0_unpack(poly *r, const unsigned char *a);
+
+void polyz_pack(unsigned char *r, const poly *a);
+void polyz_unpack(poly *r, const unsigned char *a);
+
+void polyw1_pack(unsigned char *r, const poly *a);
+#endif /* POLY_H */
diff --git a/crypto_sign/dilithium-iii/clean/polyvec.c b/crypto_sign/dilithium-iii/clean/polyvec.c
new file mode 100644
index 00000000..31db7e57
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/polyvec.c
@@ -0,0 +1,338 @@
+#include "polyvec.h"
+#include "params.h"
+#include "poly.h"
+#include <stdint.h>
+
+/**************************************************************/
+/************ Vectors of polynomials of length L **************/
+/**************************************************************/
+
+/*************************************************
+ * Name:        polyvecl_freeze
+ *
+ * Description: Reduce coefficients of polynomials in vector of length L
+ *              to standard representatives.
+ *
+ * Arguments:   - polyvecl *v: pointer to input/output vector
+ **************************************************/
+void polyvecl_freeze(polyvecl *v) {
+  /* Apply poly_freeze in place to each of the L polynomials of v. */
+  for (unsigned int idx = 0; idx < L; ++idx) {
+    poly_freeze(&v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyvecl_add
+ *
+ * Description: Add vectors of polynomials of length L.
+ *              No modular reduction is performed.
+ *
+ * Arguments:   - polyvecl *w: pointer to output vector
+ *              - const polyvecl *u: pointer to first summand
+ *              - const polyvecl *v: pointer to second summand
+ **************************************************/
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v) {
+  /* Component-wise sum: w[idx] = u[idx] + v[idx], delegated to poly_add. */
+  for (unsigned int idx = 0; idx < L; ++idx) {
+    poly_add(&w->vec[idx], &u->vec[idx], &v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyvecl_ntt
+ *
+ * Description: Forward NTT of all polynomials in vector of length L. Output
+ *              coefficients can be up to 16*Q larger than input coefficients.
+ *
+ * Arguments:   - polyvecl *v: pointer to input/output vector
+ **************************************************/
+void polyvecl_ntt(polyvecl *v) {
+  poly *cur = v->vec;
+  poly *const end = v->vec + L; /* one past the last polynomial */
+  while (cur != end)
+    poly_ntt(cur++);
+}
+
+/*************************************************
+ * Name:        polyvecl_pointwise_acc_invmontgomery
+ *
+ * Description: Pointwise multiply vectors of polynomials of length L, multiply
+ *              resulting vector by 2^{-32} and add (accumulate) polynomials
+ *              in it. Input/output vectors are in NTT domain representation.
+ *              Input coefficients are assumed to be less than 22*Q. Output
+ *              coefficients are less than 2*L*Q.
+ *
+ * Arguments:   - poly *w: output polynomial
+ *              - const polyvecl *u: pointer to first input vector
+ *              - const polyvecl *v: pointer to second input vector
+ **************************************************/
+void polyvecl_pointwise_acc_invmontgomery(poly *w, const polyvecl *u,
+                                          const polyvecl *v) {
+  poly product; /* scratch for the pointwise product of one component pair */
+
+  /* The first product is written straight into the accumulator w. */
+  poly_pointwise_invmontgomery(w, &u->vec[0], &v->vec[0]);
+
+  for (unsigned int idx = 1; idx < L; ++idx) {
+    poly_pointwise_invmontgomery(&product, &u->vec[idx], &v->vec[idx]);
+    poly_add(w, w, &product);
+  }
+}
+
+/*************************************************
+ * Name:        polyvecl_chknorm
+ *
+ * Description: Check infinity norm of polynomials in vector of length L.
+ *              Assumes input coefficients to be standard representatives.
+ *
+ * Arguments:   - const polyvecl *v: pointer to vector
+ *              - uint32_t bound: norm bound
+ *
+ * Returns 0 if norm of all polynomials is strictly smaller than bound and 1
+ * otherwise.
+ **************************************************/
+int polyvecl_chknorm(const polyvecl *v, uint32_t bound) {
+  int violated = 0;
+
+  /* OR together the poly_chknorm results; all L polynomials are checked. */
+  for (unsigned int idx = 0; idx < L; ++idx) {
+    violated |= poly_chknorm(&v->vec[idx], bound);
+  }
+  return violated;
+}
+
+/**************************************************************/
+/************ Vectors of polynomials of length K **************/
+/**************************************************************/
+
+/*************************************************
+ * Name:        polyveck_reduce
+ *
+ * Description: Reduce coefficients of polynomials in vector of length K
+ *              to representatives in [0,2*Q[.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ **************************************************/
+void polyveck_reduce(polyveck *v) {
+  poly *cur = v->vec;
+  poly *const end = v->vec + K; /* one past the last polynomial */
+  while (cur != end)
+    poly_reduce(cur++);
+}
+
+/*************************************************
+ * Name:        polyveck_csubq
+ *
+ * Description: For all coefficients of polynomials in vector of length K
+ *              subtract Q if coefficient is bigger than Q.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ **************************************************/
+void polyveck_csubq(polyveck *v) {
+  /* Apply poly_csubq in place to each of the K polynomials of v. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_csubq(&v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_freeze
+ *
+ * Description: Reduce coefficients of polynomials in vector of length K
+ *              to standard representatives.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ **************************************************/
+void polyveck_freeze(polyveck *v) {
+  /* Apply poly_freeze in place to each of the K polynomials of v. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_freeze(&v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_add
+ *
+ * Description: Add vectors of polynomials of length K.
+ *              No modular reduction is performed.
+ *
+ * Arguments:   - polyveck *w: pointer to output vector
+ *              - const polyveck *u: pointer to first summand
+ *              - const polyveck *v: pointer to second summand
+ **************************************************/
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v) {
+  /* Component-wise sum: w[idx] = u[idx] + v[idx], delegated to poly_add. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_add(&w->vec[idx], &u->vec[idx], &v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_sub
+ *
+ * Description: Subtract vectors of polynomials of length K.
+ *              Assumes coefficients of polynomials in second input vector
+ *              to be less than 2*Q. No modular reduction is performed.
+ *
+ * Arguments:   - polyveck *w: pointer to output vector
+ *              - const polyveck *u: pointer to first input vector
+ *              - const polyveck *v: pointer to second input vector to be
+ *                                   subtracted from first input vector
+ **************************************************/
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v) {
+  /* Component-wise difference: w[idx] = u[idx] - v[idx], via poly_sub. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_sub(&w->vec[idx], &u->vec[idx], &v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_shiftl
+ *
+ * Description: Multiply vector of polynomials of length K by 2^k without
+ *              modular reduction. Assumes input coefficients < 2^{32-k}.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ *              - unsigned int k: exponent
+ **************************************************/
+void polyveck_shiftl(polyveck *v, unsigned int k) {
+  /* Call poly_shiftl with the same exponent k on each of the K polynomials. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_shiftl(&v->vec[idx], k);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_ntt
+ *
+ * Description: Forward NTT of all polynomials in vector of length K. Output
+ *              coefficients can be up to 16*Q larger than input coefficients.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ **************************************************/
+void polyveck_ntt(polyveck *v) {
+  /* Apply poly_ntt in place to each of the K polynomials of v. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_ntt(&v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_invntt_montgomery
+ *
+ * Description: Inverse NTT and multiplication by 2^{32} of polynomials
+ *              in vector of length K. Input coefficients need to be less
+ *              than 2*Q.
+ *
+ * Arguments:   - polyveck *v: pointer to input/output vector
+ **************************************************/
+void polyveck_invntt_montgomery(polyveck *v) {
+  poly *cur = v->vec;
+  poly *const end = v->vec + K; /* one past the last polynomial */
+  while (cur != end)
+    poly_invntt_montgomery(cur++);
+}
+
+/*************************************************
+ * Name:        polyveck_chknorm
+ *
+ * Description: Check infinity norm of polynomials in vector of length K.
+ *              Assumes input coefficients to be standard representatives.
+ *
+ * Arguments:   - const polyveck *v: pointer to vector
+ *              - uint32_t bound: norm bound
+ *
+ * Returns 0 if norm of all polynomials is strictly smaller than bound and 1
+ * otherwise.
+ **************************************************/
+int polyveck_chknorm(const polyveck *v, uint32_t bound) {
+  int violated = 0;
+
+  /* OR together the poly_chknorm results; all K polynomials are checked. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    violated |= poly_chknorm(&v->vec[idx], bound);
+  }
+  return violated;
+}
+
+/*************************************************
+ * Name:        polyveck_power2round
+ *
+ * Description: For all coefficients a of polynomials in vector of length K,
+ *              compute a0, a1 such that a mod Q = a1*2^D + a0
+ *              with -2^{D-1} < a0 <= 2^{D-1}. Assumes coefficients to be
+ *              standard representatives.
+ *
+ * Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+ *                              coefficients a1
+ *              - polyveck *v0: pointer to output vector of polynomials with
+ *                              coefficients Q + a0
+ *              - const polyveck *v: pointer to input vector
+ **************************************************/
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v) {
+  /* Split each polynomial of v into v1[idx] and v0[idx] via poly_power2round. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_power2round(&v1->vec[idx], &v0->vec[idx], &v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_decompose
+ *
+ * Description: For all coefficients a of polynomials in vector of length K,
+ *              compute high/low bits a1, a0 such that a mod Q = a1*ALPHA + a0
+ *              with -ALPHA/2 < a0 <= ALPHA/2 except a1 = (Q-1)/ALPHA where we
+ *              set a1 = 0 and -ALPHA/2 <= a0 = a mod Q - Q < 0.
+ *              Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - polyveck *v1: pointer to output vector of polynomials with
+ *                              coefficients a1
+ *              - polyveck *v0: pointer to output vector of polynomials with
+ *                              coefficients Q + a0
+ *              - const polyveck *v: pointer to input vector
+ **************************************************/
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v) {
+  /* Split each polynomial of v into v1[idx] and v0[idx] via poly_decompose. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_decompose(&v1->vec[idx], &v0->vec[idx], &v->vec[idx]);
+  }
+}
+
+/*************************************************
+ * Name:        polyveck_make_hint
+ *
+ * Description: Compute hint vector.
+ *
+ * Arguments:   - polyveck *h: pointer to output vector
+ *              - const polyveck *u: pointer to first input vector
+ *              - const polyveck *v: pointer to second input vector
+ *
+ * Returns number of 1 bits.
+ **************************************************/
+unsigned int polyveck_make_hint(polyveck *h, const polyveck *u,
+                                const polyveck *v) {
+  unsigned int total = 0; /* running sum of the poly_make_hint return values */
+
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    total += poly_make_hint(&h->vec[idx], &u->vec[idx], &v->vec[idx]);
+  }
+  return total;
+}
+
+/*************************************************
+ * Name:        polyveck_use_hint
+ *
+ * Description: Use hint vector to correct the high bits of input vector.
+ *
+ * Arguments:   - polyveck *w: pointer to output vector of polynomials with
+ *                             corrected high bits
+ *              - const polyveck *u: pointer to input vector
+ *              - const polyveck *h: pointer to input hint vector
+ **************************************************/
+void polyveck_use_hint(polyveck *w, const polyveck *u, const polyveck *h) {
+  /* w[idx] is produced by poly_use_hint from u[idx] and the hint h[idx]. */
+  for (unsigned int idx = 0; idx < K; ++idx) {
+    poly_use_hint(&w->vec[idx], &u->vec[idx], &h->vec[idx]);
+  }
+}
diff --git a/crypto_sign/dilithium-iii/clean/polyvec.h b/crypto_sign/dilithium-iii/clean/polyvec.h
new file mode 100644
index 00000000..8e5a94dc
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/polyvec.h
@@ -0,0 +1,47 @@
+#ifndef POLYVEC_H
+#define POLYVEC_H
+
+#include "params.h"
+#include "poly.h"
+#include <stdint.h>
+
+/* Vectors of polynomials of length L */
+typedef struct {
+  poly vec[L]; /* the L component polynomials */
+} polyvecl;
+
+void polyvecl_freeze(polyvecl *v);
+
+void polyvecl_add(polyvecl *w, const polyvecl *u, const polyvecl *v);
+
+void polyvecl_ntt(polyvecl *v);
+void polyvecl_pointwise_acc_invmontgomery(poly *w, const polyvecl *u,
+                                          const polyvecl *v);
+
+int polyvecl_chknorm(const polyvecl *v, uint32_t B);
+
+/* Vectors of polynomials of length K */
+typedef struct {
+  poly vec[K]; /* the K component polynomials */
+} polyveck;
+
+void polyveck_reduce(polyveck *v);
+void polyveck_csubq(polyveck *v);
+void polyveck_freeze(polyveck *v);
+
+void polyveck_add(polyveck *w, const polyveck *u, const polyveck *v);
+void polyveck_sub(polyveck *w, const polyveck *u, const polyveck *v);
+void polyveck_shiftl(polyveck *v, unsigned int k);
+
+void polyveck_ntt(polyveck *v);
+void polyveck_invntt_montgomery(polyveck *v);
+
+int polyveck_chknorm(const polyveck *v, uint32_t B);
+
+void polyveck_power2round(polyveck *v1, polyveck *v0, const polyveck *v);
+void polyveck_decompose(polyveck *v1, polyveck *v0, const polyveck *v);
+unsigned int polyveck_make_hint(polyveck *h, const polyveck *u,
+                                const polyveck *v);
+void polyveck_use_hint(polyveck *w, const polyveck *v, const polyveck *h);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/reduce.c b/crypto_sign/dilithium-iii/clean/reduce.c
new file mode 100644
index 00000000..ec6d834a
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/reduce.c
@@ -0,0 +1,74 @@
+#include "reduce.h"
+#include "params.h"
+#include <stdint.h>
+
+/*************************************************
+ * Name:        montgomery_reduce
+ *
+ * Description: For finite field element a with 0 <= a <= Q*2^32,
+ *              compute r \equiv a*2^{-32} (mod Q) such that 0 <= r < 2*Q.
+ *
+ * Arguments:   - uint64_t: finite field element a
+ *
+ * Returns r.
+ **************************************************/
+uint32_t montgomery_reduce(uint64_t a) {
+  /* m = (a * QINV) mod 2^32. With QINV = -Q^{-1} mod 2^32, the low 32 bits
+   * of a + m*Q are zero, so the shift below is an exact division. */
+  const uint64_t m = (a * QINV) & 0xFFFFFFFFULL;
+  const uint64_t sum = a + m * Q;
+
+  /* For a <= Q*2^32 the sum stays below 2*Q*2^32, hence the result is
+   * below 2*Q. */
+  return sum >> 32;
+}
+
+/*************************************************
+ * Name:        reduce32
+ *
+ * Description: For finite field element a, compute r \equiv a (mod Q)
+ *              such that 0 <= r < 2*Q.
+ *
+ * Arguments:   - uint32_t: finite field element a
+ *
+ * Returns r.
+ **************************************************/
+uint32_t reduce32(uint32_t a) {
+  const uint32_t low = a & 0x7FFFFF; /* a mod 2^23 */
+  const uint32_t high = a >> 23;     /* floor(a / 2^23) */
+
+  /* a = high*2^23 + low; each 2^23 is replaced by 2^13 - 1, which is
+   * congruent mod Q if Q == 2^23 - 2^13 + 1 (confirm against params.h). */
+  return low + ((high << 13) - high);
+}
+
+/*************************************************
+ * Name:        csubq
+ *
+ * Description: Subtract Q if input coefficient is greater than or equal to Q.
+ *
+ * Arguments:   - uint32_t: finite field element a
+ *
+ * Returns r.
+ **************************************************/
+uint32_t csubq(uint32_t a) {
+  const uint32_t diff = a - Q; /* wraps around when a < Q */
+  /* The sign-bit mask re-adds Q in that case (assumes arithmetic >>). */
+  return diff + (((int32_t)diff >> 31) & Q);
+}
+
+/*************************************************
+ * Name:        freeze
+ *
+ * Description: For finite field element a, compute standard
+ *              representative r = a mod Q.
+ *
+ * Arguments:   - uint32_t: finite field element a
+ *
+ * Returns r.
+ **************************************************/
+uint32_t freeze(uint32_t a) {
+  /* reduce32 brings a below 2*Q; csubq then removes at most one more Q,
+   * which leaves the standard representative below Q. */
+  return csubq(reduce32(a));
+}
diff --git a/crypto_sign/dilithium-iii/clean/reduce.h b/crypto_sign/dilithium-iii/clean/reduce.h
new file mode 100644
index 00000000..83aeffaf
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/reduce.h
@@ -0,0 +1,21 @@
+#ifndef REDUCE_H
+#define REDUCE_H
+
+#include <stdint.h>
+
+#define MONT 4193792U    // 2^32 % Q
+#define QINV 4236238847U // -Q^(-1) mod 2^32, used by montgomery_reduce
+
+/* r == a * 2^{-32} (mod Q); a <= Q*2^32 => r < 2*Q */
+uint32_t montgomery_reduce(uint64_t a);
+
+/* r == a (mod Q), r < 2*Q */
+uint32_t reduce32(uint32_t a);
+
+/* subtracts Q when a >= Q; a < 2*Q => r < Q */
+uint32_t csubq(uint32_t a);
+
+/* r == a mod Q, r < Q */
+uint32_t freeze(uint32_t a);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/rounding.c b/crypto_sign/dilithium-iii/clean/rounding.c
new file mode 100644
index 00000000..75425a91
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/rounding.c
@@ -0,0 +1,115 @@
+#include "params.h"
+#include <stdint.h>
+
+/*************************************************
+ * Name:        power2round
+ *
+ * Description: For finite field element a, compute a0, a1 such that
+ *              a mod Q = a1*2^D + a0 with -2^{D-1} < a0 <= 2^{D-1}.
+ *              Assumes a to be standard representative.
+ *
+ * Arguments:   - uint32_t a: input element
+ *              - uint32_t *a0: pointer to output element Q + a0
+ *
+ * Returns a1.
+ **************************************************/
+uint32_t power2round(uint32_t a, uint32_t *a0) {
+  int32_t t;
+
+  /* Centralized remainder mod 2^D */
+  t = a & ((1 << D) - 1); /* r = a mod 2^D */
+  t -= (1 << (D - 1)) + 1; /* negative iff r <= 2^(D-1) */
+  t += (t >> 31) & (1 << D); /* add 2^D back if negative (arithmetic >>) */
+  t -= (1 << (D - 1)) - 1; /* now t = r if r <= 2^(D-1), else r - 2^D */
+  *a0 = Q + t; /* the low part is stored offset by Q */
+  a = (a - t) >> D; /* a - t is a multiple of 2^D; the quotient is a1 */
+  return a;
+}
+
+/*************************************************
+ * Name:        decompose
+ *
+ * Description: For finite field element a, compute high and low bits a1, a0
+ *              such that a mod Q = a1*ALPHA + a0 with -ALPHA/2 < a0 <= ALPHA/2,
+ *              except if a1 = (Q-1)/ALPHA, where we set a1 = 0 and -ALPHA/2 <=
+ *              a0 = a mod Q - Q < 0. Assumes a to be standard representative.
+ *
+ * Arguments:   - uint32_t a: input element
+ *              - uint32_t *a0: pointer to output element Q + a0
+ *
+ * Returns a1.
+ **************************************************/
+uint32_t decompose(uint32_t a, uint32_t *a0) {
+#if ALPHA != (Q - 1) / 16
+#error "decompose assumes ALPHA == (Q-1)/16"
+#endif
+  int32_t t, u;
+
+  /* Centralized remainder mod ALPHA */
+  t = a & 0x7FFFF; /* low 19 bits of a */
+  t += (a >> 19) << 9; /* fold in high bits; assumes ALPHA == 2^19 - 2^9 */
+  t -= ALPHA / 2 + 1; /* negative iff the folded value is <= ALPHA/2 */
+  t += (t >> 31) & ALPHA; /* add ALPHA back if negative (arithmetic >>) */
+  t -= ALPHA / 2 - 1; /* folded value, minus ALPHA if it exceeded ALPHA/2 */
+  a -= t; /* remove the low part from a */
+
+  /* Divide by ALPHA (possible to avoid) */
+  u = a - 1; /* a == 0 makes u negative */
+  u >>= 31; /* all-ones mask in that case, 0 otherwise (arithmetic >>) */
+  a = (a >> 19) + 1; /* for a = k*ALPHA with k >= 1 the shift gives k - 1 */
+  a -= u & 1; /* take the +1 back when a was 0 */
+
+  /* Border case */
+  *a0 = Q + t - (a >> 4); /* a >> 4 is 1 only when the quotient reached 16 */
+  a &= 0xF; /* keep four bits, so a quotient of 16 becomes a1 = 0 */
+  return a;
+}
+
+/*************************************************
+ * Name:        make_hint
+ *
+ * Description: Compute hint bit indicating whether or not high bits of two
+ *              finite field elements differ. Assumes input elements to be
+ *              standard representatives.
+ *
+ * Arguments:   - uint32_t a: first input element
+ *              - uint32_t b: second input element
+ *
+ * Returns 1 if high bits of a and b differ and 0 otherwise.
+ **************************************************/
+unsigned int make_hint(const uint32_t a, const uint32_t b) {
+  uint32_t low_a, low_b; /* low parts are written by decompose but not used */
+  const uint32_t high_a = decompose(a, &low_a);
+  return high_a != decompose(b, &low_b);
+}
+
+/*************************************************
+ * Name:        use_hint
+ *
+ * Description: Correct high bits according to hint.
+ *
+ * Arguments:   - uint32_t a: input element
+ *              - unsigned int hint: hint bit
+ *
+ * Returns corrected high bits.
+ **************************************************/
+uint32_t use_hint(const uint32_t a, const unsigned int hint) {
+  uint32_t low;
+  const uint32_t high = decompose(a, &low);
+
+  /* Without a hint the high bits are returned unchanged. */
+  if (hint == 0) {
+    return high;
+  }
+  /* decompose returns a0 offset by Q, so low > Q means a positive a0. */
+  return (low > Q ? high + 1 : high - 1) & 0xF;
+
+  /* If decompose does not divide out ALPHA:
+  if(hint == 0)
+    return a1;
+  else if(a0 > Q)
+    return (a1 + ALPHA) % (Q - 1);
+  else
+    return (a1 - ALPHA) % (Q - 1);
+  */
+}
diff --git a/crypto_sign/dilithium-iii/clean/rounding.h b/crypto_sign/dilithium-iii/clean/rounding.h
new file mode 100644
index 00000000..6d3b2960
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/rounding.h
@@ -0,0 +1,11 @@
+#ifndef ROUNDING_H
+#define ROUNDING_H
+
+#include <stdint.h>
+
+uint32_t power2round(const uint32_t a, uint32_t *a0);
+uint32_t decompose(uint32_t a, uint32_t *a0);
+unsigned int make_hint(const uint32_t a, const uint32_t b);
+uint32_t use_hint(const uint32_t a, const unsigned int hint);
+
+#endif
diff --git a/crypto_sign/dilithium-iii/clean/sign.c b/crypto_sign/dilithium-iii/clean/sign.c
new file mode 100644
index 00000000..7144d779
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/sign.c
@@ -0,0 +1,360 @@
+#include "sign.h"
+#include "fips202.h"
+#include "packing.h"
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+#include "randombytes.h"
+#include <stdint.h>
+
+/*************************************************
+ * Name:        expand_mat
+ *
+ * Description: Implementation of ExpandA. Generates matrix A with uniformly
+ *              random coefficients a_{i,j} by performing rejection
+ *              sampling on the output stream of SHAKE128(rho|i|j).
+ *
+ * Arguments:   - polyvecl mat[K]: output matrix
+ *              - const unsigned char rho[]: byte array containing seed rho
+ **************************************************/
+void expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]) {
+  unsigned int row, col, k;
+  unsigned char seed[SEEDBYTES + 1]; /* rho followed by one index byte */
+  /* Keep this at five SHAKE128 blocks: poly_uniform is handed only this
+   * buffer and the sampling assumes it is long enough.
+   * Probability that we need more than 5 blocks: < 2^{-132}.
+   * Probability that we need more than 6 blocks: < 2^{-546}. */
+  unsigned char stream[5 * SHAKE128_RATE];
+
+  for (k = 0; k < SEEDBYTES; ++k) {
+    seed[k] = rho[k];
+  }
+  for (row = 0; row < K; ++row) {
+    for (col = 0; col < L; ++col) {
+      seed[SEEDBYTES] = row + (col << 4); /* index byte: row + 16 * col */
+      shake128(stream, sizeof(stream), seed, SEEDBYTES + 1);
+      poly_uniform(&mat[row].vec[col], stream);
+    }
+  }
+}
+
+/*************************************************
+ * Name:        challenge
+ *
+ * Description: Implementation of H. Samples polynomial with 60 nonzero
+ *              coefficients in {-1,1} using the output stream of
+ *              SHAKE256(mu|w1).
+ *
+ * Arguments:   - poly *c: pointer to output polynomial
+ *              - const unsigned char mu[]: byte array containing mu
+ *              - const polyveck *w1: pointer to vector w1
+ **************************************************/
+void challenge(poly *c, const unsigned char mu[CRHBYTES], const polyveck *w1) {
+  unsigned int i, b, pos;
+  unsigned char inbuf[CRHBYTES + K * POLW1_SIZE_PACKED];
+  unsigned char outbuf[SHAKE256_RATE];
+  uint64_t state[25], signs, mask;
+  /* Hash input: mu followed by the K packed polynomials of w1 */
+  for (i = 0; i < CRHBYTES; ++i)
+    inbuf[i] = mu[i];
+  for (i = 0; i < K; ++i)
+    polyw1_pack(inbuf + CRHBYTES + i * POLW1_SIZE_PACKED, w1->vec + i);
+
+  shake256_absorb(state, inbuf, sizeof(inbuf));
+  shake256_squeezeblocks(outbuf, 1, state);
+
+  signs = 0; /* first 8 output bytes, little-endian: one sign bit per draw */
+  for (i = 0; i < 8; ++i)
+    signs |= (uint64_t)outbuf[i] << 8 * i;
+
+  pos = 8; /* bytes 0..7 of the first block were consumed by signs */
+  mask = 1;
+
+  for (i = 0; i < N; ++i)
+    c->coeffs[i] = 0;
+
+  for (i = 196; i < 256; ++i) { /* 60 rounds, each adds one nonzero entry */
+    do { /* draw bytes until one is <= i, squeezing new blocks on demand */
+      if (pos >= SHAKE256_RATE) {
+        shake256_squeezeblocks(outbuf, 1, state);
+        pos = 0;
+      }
+
+      b = outbuf[pos++];
+    } while (b > i);
+
+    c->coeffs[i] = c->coeffs[b]; /* move the old entry at b to position i */
+    c->coeffs[b] = (signs & mask) ? Q - 1 : 1; /* Q - 1 represents -1 mod Q */
+    mask <<= 1; /* advance to the next sign bit */
+  }
+}
+
+/*************************************************
+ * Name:        crypto_sign_keypair
+ *
+ * Description: Generates public and private key.
+ *
+ * Arguments:   - unsigned char *pk: pointer to output public key (allocated
+ *                                   array of CRYPTO_PUBLICKEYBYTES bytes)
+ *              - unsigned char *sk: pointer to output private key (allocated
+ *                                   array of CRYPTO_SECRETKEYBYTES bytes)
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+  unsigned char seeds[3 * SEEDBYTES];
+  unsigned char *const rho = seeds;
+  unsigned char *const rhoprime = seeds + SEEDBYTES;
+  unsigned char *const key = seeds + 2 * SEEDBYTES;
+  unsigned char tr[CRHBYTES];
+  uint16_t counter = 0; /* per-polynomial nonce for poly_uniform_eta */
+  polyvecl mat[K], s1, s1hat;
+  polyveck s2, t, t1, t0;
+  unsigned int row, k;
+
+  /* Stretch SEEDBYTES random bytes in place into rho | rhoprime | key */
+  randombytes(seeds, SEEDBYTES);
+  shake256(seeds, 3 * SEEDBYTES, seeds, SEEDBYTES);
+
+  /* Matrix A from rho */
+  expand_mat(mat, rho);
+
+  /* Secret vectors: s1 takes nonces 0..L-1, s2 the next K nonces */
+  for (k = 0; k < L; ++k, ++counter) {
+    poly_uniform_eta(&s1.vec[k], rhoprime, counter);
+  }
+  for (k = 0; k < K; ++k, ++counter) {
+    poly_uniform_eta(&s2.vec[k], rhoprime, counter);
+  }
+
+  /* t = A * s1, computed on an NTT copy so that s1 itself stays untouched */
+  s1hat = s1;
+  polyvecl_ntt(&s1hat);
+  for (row = 0; row < K; ++row) {
+    poly *const t_row = &t.vec[row];
+    polyvecl_pointwise_acc_invmontgomery(t_row, &mat[row], &s1hat);
+    poly_reduce(t_row);
+    poly_invntt_montgomery(t_row);
+  }
+
+  /* t += s2, then bring the coefficients to standard representatives */
+  polyveck_add(&t, &t, &s2);
+  polyveck_freeze(&t);
+
+  /* Split t into (t1, t0); the public key packs rho and t1 */
+  polyveck_power2round(&t1, &t0, &t);
+  pack_pk(pk, rho, &t1);
+
+  /* tr = SHAKE256 of the packed public key; the secret key packs the rest */
+  shake256(tr, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  pack_sk(sk, rho, key, tr, &s1, &s2, &t0);
+  return 0;
+}
+
+/*************************************************
+ * Name:        crypto_sign
+ *
+ * Description: Compute signed message.
+ *
+ * Arguments:   - unsigned char *sm: pointer to output signed message (allocated
+ *                                   array with CRYPTO_BYTES + mlen bytes),
+ *                                   can be equal to m
+ *              - unsigned long long *smlen: pointer to output length of signed
+ *                                           message
+ *              - const unsigned char *m: pointer to message to be signed
+ *              - unsigned long long mlen: length of message
+ *              - const unsigned char *sk: pointer to bit-packed secret key
+ *
+ * Returns 0 (success)
+ **************************************************/
+int crypto_sign(unsigned char *sm, unsigned long long *smlen,
+                const unsigned char *m, unsigned long long mlen,
+                const unsigned char *sk) {
+  unsigned long long i, j;
+  unsigned int n;
+  unsigned char
+      seedbuf[2 * SEEDBYTES + CRHBYTES]; // TODO: nonce in seedbuf (2x)
+  unsigned char tr[CRHBYTES];
+  unsigned char *rho, *key, *mu;
+  uint16_t nonce = 0; /* advances per sampled polynomial; never reset */
+  poly c, chat;
+  polyvecl mat[K], s1, y, yhat, z;
+  polyveck s2, t0, w, w1;
+  polyveck h, wcs2, wcs20, ct0, tmp;
+  /* seedbuf layout: rho | key | mu */
+  rho = seedbuf;
+  key = seedbuf + SEEDBYTES; /* key and mu are adjacent: key is later passed */
+  mu = seedbuf + 2 * SEEDBYTES; /* as a SEEDBYTES + CRHBYTES byte seed */
+  unpack_sk(rho, key, tr, &s1, &s2, &t0, sk);
+
+  /* Copy tr and message into the sm buffer,
+   * backwards since m and sm can be equal in SUPERCOP API */
+  for (i = 1; i <= mlen; ++i)
+    sm[CRYPTO_BYTES + mlen - i] = m[mlen - i];
+  for (i = 0; i < CRHBYTES; ++i)
+    sm[CRYPTO_BYTES - CRHBYTES + i] = tr[i];
+
+  /* Compute CRH(tr, msg) */
+  shake256(mu, CRHBYTES, sm + CRYPTO_BYTES - CRHBYTES, CRHBYTES + mlen);
+
+  /* Expand matrix and transform vectors */
+  expand_mat(mat, rho);
+  polyvecl_ntt(&s1);
+  polyveck_ntt(&s2);
+  polyveck_ntt(&t0);
+
+rej:
+  /* Sample intermediate vector y; a retry continues with the next nonces */
+  for (i = 0; i < L; ++i)
+    poly_uniform_gamma1m1(y.vec + i, key, nonce++);
+
+  /* Matrix-vector multiplication */
+  yhat = y;
+  polyvecl_ntt(&yhat);
+  for (i = 0; i < K; ++i) {
+    polyvecl_pointwise_acc_invmontgomery(w.vec + i, mat + i, &yhat);
+    poly_reduce(w.vec + i);
+    poly_invntt_montgomery(w.vec + i);
+  }
+
+  /* Decompose w and call the random oracle */
+  polyveck_csubq(&w);
+  polyveck_decompose(&w1, &tmp, &w); /* w1: high parts; tmp: low, unused */
+  challenge(&c, mu, &w1);
+
+  /* Compute z, reject if it reveals secret */
+  chat = c;
+  poly_ntt(&chat);
+  for (i = 0; i < L; ++i) {
+    poly_pointwise_invmontgomery(z.vec + i, &chat, s1.vec + i);
+    poly_invntt_montgomery(z.vec + i);
+  }
+  polyvecl_add(&z, &z, &y);
+  polyvecl_freeze(&z);
+  if (polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto rej;
+
+  /* Compute w - cs2, reject if w1 can not be computed from it */
+  for (i = 0; i < K; ++i) {
+    poly_pointwise_invmontgomery(wcs2.vec + i, &chat, s2.vec + i);
+    poly_invntt_montgomery(wcs2.vec + i);
+  }
+  polyveck_sub(&wcs2, &w, &wcs2);
+  polyveck_freeze(&wcs2);
+  polyveck_decompose(&tmp, &wcs20, &wcs2); /* tmp: high parts of w - cs2 */
+  polyveck_csubq(&wcs20);
+  if (polyveck_chknorm(&wcs20, GAMMA2 - BETA))
+    goto rej;
+  /* Reject unless the high parts of w - cs2 (in tmp) equal w1 */
+  for (i = 0; i < K; ++i)
+    for (j = 0; j < N; ++j)
+      if (tmp.vec[i].coeffs[j] != w1.vec[i].coeffs[j])
+        goto rej;
+
+  /* Compute hints for w1 */
+  for (i = 0; i < K; ++i) {
+    poly_pointwise_invmontgomery(ct0.vec + i, &chat, t0.vec + i);
+    poly_invntt_montgomery(ct0.vec + i);
+  }
+
+  polyveck_csubq(&ct0);
+  if (polyveck_chknorm(&ct0, GAMMA2))
+    goto rej;
+  /* tmp is reused: tmp = (w - cs2) + ct0; n is the hint count returned */
+  polyveck_add(&tmp, &wcs2, &ct0);
+  polyveck_csubq(&tmp);
+  n = polyveck_make_hint(&h, &wcs2, &tmp);
+  if (n > OMEGA) /* too many hint bits set */
+    goto rej;
+
+  /* Write signature */
+  pack_sig(sm, &z, &h, &c);
+
+  *smlen = mlen + CRYPTO_BYTES;
+  return 0;
+}
+
+/*************************************************
+ * Name:        crypto_sign_open
+ *
+ * Description: Verify signed message.
+ *
+ * Arguments:   - unsigned char *m: pointer to output message (allocated
+ *                                  array with smlen bytes), can be equal to sm
+ *              - unsigned long long *mlen: pointer to output length of message
+ *              - const unsigned char *sm: pointer to signed message
+ *              - unsigned long long smlen: length of signed message
+ *              - const unsigned char *pk: pointer to bit-packed public key
+ *
+ * Returns 0 if signed message could be verified correctly and -1 otherwise
+ **************************************************/
+int crypto_sign_open(unsigned char *m, unsigned long long *mlen,
+                     const unsigned char *sm, unsigned long long smlen,
+                     const unsigned char *pk) {
+  unsigned long long i;
+  unsigned char rho[SEEDBYTES];
+  unsigned char mu[CRHBYTES];
+  poly c, chat, cp;
+  polyvecl mat[K], z;
+  polyveck t1, w1, h, tmp1, tmp2;
+
+  if (smlen < CRYPTO_BYTES) /* too short to contain a signature */
+    goto badsig;
+
+  *mlen = smlen - CRYPTO_BYTES; /* provisional; replaced at badsig on failure */
+
+  unpack_pk(rho, &t1, pk);
+  if (unpack_sig(&z, &h, &c, sm)) /* nonzero return is treated as bad sig */
+    goto badsig;
+  if (polyvecl_chknorm(&z, GAMMA1 - BETA))
+    goto badsig;
+
+  /* Compute CRH(CRH(rho, t1), msg) using m as "playground" buffer */
+  if (sm != m)
+    for (i = 0; i < *mlen; ++i)
+      m[CRYPTO_BYTES + i] = sm[CRYPTO_BYTES + i];
+  /* The hash of pk is written directly in front of the message inside m */
+  shake256(m + CRYPTO_BYTES - CRHBYTES, CRHBYTES, pk, CRYPTO_PUBLICKEYBYTES);
+  shake256(mu, CRHBYTES, m + CRYPTO_BYTES - CRHBYTES, CRHBYTES + *mlen);
+
+  /* Matrix-vector multiplication; compute Az - c2^dt1 */
+  expand_mat(mat, rho);
+  polyvecl_ntt(&z);
+  for (i = 0; i < K; ++i)
+    polyvecl_pointwise_acc_invmontgomery(tmp1.vec + i, mat + i, &z);
+
+  chat = c;
+  poly_ntt(&chat);
+  polyveck_shiftl(&t1, D);
+  polyveck_ntt(&t1);
+  for (i = 0; i < K; ++i)
+    poly_pointwise_invmontgomery(tmp2.vec + i, &chat, t1.vec + i);
+
+  polyveck_sub(&tmp1, &tmp1, &tmp2);
+  polyveck_reduce(&tmp1);
+  polyveck_invntt_montgomery(&tmp1);
+
+  /* Reconstruct w1 */
+  polyveck_csubq(&tmp1);
+  polyveck_use_hint(&w1, &tmp1, &h);
+
+  /* Call random oracle and verify challenge */
+  challenge(&cp, mu, &w1);
+  for (i = 0; i < N; ++i)
+    if (c.coeffs[i] != cp.coeffs[i])
+      goto badsig;
+
+  /* All good, copy msg, return 0 */
+  for (i = 0; i < *mlen; ++i)
+    m[i] = sm[CRYPTO_BYTES + i];
+
+  return 0;
+
+/* Signature verification failed */
+badsig:
+  *mlen = (unsigned long long)-1;
+  for (i = 0; i < smlen; ++i) /* zero all smlen bytes of the output buffer */
+    m[i] = 0;
+
+  return -1;
+}
diff --git a/crypto_sign/dilithium-iii/clean/sign.h b/crypto_sign/dilithium-iii/clean/sign.h
new file mode 100644
index 00000000..82020227
--- /dev/null
+++ b/crypto_sign/dilithium-iii/clean/sign.h
@@ -0,0 +1,21 @@
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "params.h"
+#include "poly.h"
+#include "polyvec.h"
+/* Helpers: expand_mat fills mat from seed rho; challenge sets c from mu, w1. */
+void expand_mat(polyvecl mat[K], const unsigned char rho[SEEDBYTES]);
+void challenge(poly *c, const unsigned char mu[CRHBYTES], const polyveck *w1);
+/* Writes a key pair to pk and sk (presumably returns 0 on success; confirm). */
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+/* Signs len bytes of msg with sk; signed message goes to sm, size to *smlen. */
+int crypto_sign(unsigned char *sm, unsigned long long *smlen,
+                const unsigned char *msg, unsigned long long len,
+                const unsigned char *sk);
+/* Verifies sm under pk: 0 with message in m, or -1 with m zeroed, *mlen = -1. */
+int crypto_sign_open(unsigned char *m, unsigned long long *mlen,
+                     const unsigned char *sm, unsigned long long smlen,
+                     const unsigned char *pk);
+
+#endif
diff --git a/crypto_sign/test.c b/crypto_sign/test.c
new file mode 100644
index 00000000..39ec8bcb
--- /dev/null
+++ b/crypto_sign/test.c
@@ -0,0 +1,95 @@
+#include "api.h"
+#include "randombytes.h"
+#include <stdio.h>
+#include <string.h>
+
+#define NTESTS 15
+#define MLEN 32
+
+/* Test buffers carry 8 extra bytes on each side; a canary is stamped there so
+ * that an implementation writing outside its buffer can be detected.
+ * memcpy is used because d is a byte pointer with no alignment guarantee. */
+static void write_canary(unsigned char *d) {
+  memcpy(d, &(uint64_t){0x0123456789ABCDEF}, sizeof(uint64_t));
+}
+
+/* Returns 0 if the 8 bytes at d still hold the canary value, -1 otherwise. */
+static int check_canary(unsigned char *d) {
+  uint64_t v;
+  memcpy(&v, d, sizeof v); /* d may be unaligned; avoid a type-punned load */
+  return v == 0x0123456789ABCDEF ? 0 : -1;
+}
+/* Sign/open round trip with canary-guarded buffers; 0 if all pass, else -1. */
+static int test_sign(void) {
+  unsigned char pk[CRYPTO_PUBLICKEYBYTES + 16];
+  unsigned char sk[CRYPTO_SECRETKEYBYTES + 16];
+  unsigned char sm[MLEN + CRYPTO_BYTES + 16];
+  unsigned char m[MLEN + 16];
+  unsigned long long mlen, smlen;
+  int i, ret = 0;
+
+  write_canary(pk);
+  write_canary(pk + sizeof(pk) - 8);
+  write_canary(sk);
+  write_canary(sk + sizeof(sk) - 8);
+  write_canary(sm);
+  write_canary(sm + sizeof(sm) - 8);
+  write_canary(m);
+  write_canary(m + sizeof(m) - 8);
+  for (i = 0; i < NTESTS; i++) {
+    crypto_sign_keypair(pk + 8, sk + 8);
+    randombytes(m + 8, MLEN);
+    crypto_sign(sm + 8, &smlen, m + 8, MLEN, sk + 8);
+    // By relying on m == sm we prevent having to allocate CRYPTO_BYTES twice
+    if (crypto_sign_open(sm + 8, &mlen, sm + 8, smlen, pk + 8)) {
+      printf("ERROR Signature did not verify correctly!\n");
+      ret = -1;
+    } else if (mlen != MLEN || memcmp(m + 8, sm + 8, MLEN)) {
+      printf("ERROR Recovered message does not match!\n");
+      ret = -1;
+    } else if (check_canary(pk) || check_canary(pk + sizeof(pk) - 8) ||
+               check_canary(sk) || check_canary(sk + sizeof(sk) - 8) ||
+               check_canary(sm) || check_canary(sm + sizeof(sm) - 8) ||
+               check_canary(m) || check_canary(m + sizeof(m) - 8)) {
+      printf("ERROR canary overwritten\n");
+      ret = -1;
+    }
+  }
+  return ret;
+}
+
+/* Checks that a signature is rejected when opened with a public key that
+ * does not belong to the signing secret key. Returns 0 on success, -1 if a
+ * signature was wrongly accepted. */
+static int test_wrong_pk(void) {
+  unsigned char pk[CRYPTO_PUBLICKEYBYTES];
+  unsigned char pk2[CRYPTO_PUBLICKEYBYTES];
+  unsigned char sk[CRYPTO_SECRETKEYBYTES];
+  unsigned char sm[MLEN + CRYPTO_BYTES];
+  unsigned char m[MLEN];
+  unsigned long long mlen, smlen;
+  int i, ret = 0;
+
+  for (i = 0; i < NTESTS; i++) {
+    /* pk2 comes from a separate keypair call; the second call overwrites sk. */
+    crypto_sign_keypair(pk2, sk);
+    crypto_sign_keypair(pk, sk);
+
+    randombytes(m, MLEN);
+    crypto_sign(sm, &smlen, m, MLEN, sk);
+
+    // By relying on m == sm we prevent having to allocate CRYPTO_BYTES twice
+    if (!crypto_sign_open(sm, &mlen, sm, smlen, pk2)) {
+      printf("ERROR Signature did verify correctly under wrong public key!\n");
+      ret = -1;
+    }
+  }
+  return ret;
+}
+
+/* Runs both test groups; exit status is non-zero if either returns non-zero. */
+int main(void) {
+  int failed = test_sign() != 0;
+  failed |= test_wrong_pk() != 0;
+  return failed;
+}