From 44a050106a2952bf5e4438621882703a1e92fc3f Mon Sep 17 00:00:00 2001
From: Thomas Pornin <thomas.pornin@nccgroup.com>
Date: Thu, 19 Sep 2019 15:55:28 -0400
Subject: [PATCH] Fixed sampler bug (update to new upstream Falcon code
 2019-09-18).

---
 crypto_sign/falcon-1024/META.yml        |   4 +-
 crypto_sign/falcon-1024/clean/common.c  |  42 +++-
 crypto_sign/falcon-1024/clean/fpr.c     |  25 ++-
 crypto_sign/falcon-1024/clean/fpr.h     |   4 +-
 crypto_sign/falcon-1024/clean/inner.h   | 195 ++++++++++++++++--
 crypto_sign/falcon-1024/clean/keygen.c  |  21 +-
 crypto_sign/falcon-1024/clean/pqclean.c |  38 ++--
 crypto_sign/falcon-1024/clean/rng.c     |   4 +-
 crypto_sign/falcon-1024/clean/sign.c    | 250 ++++++++++++++++++++----
 crypto_sign/falcon-1024/clean/vrfy.c    | 115 ++++++++++-
 crypto_sign/falcon-512/META.yml         |   4 +-
 crypto_sign/falcon-512/clean/common.c   |  42 +++-
 crypto_sign/falcon-512/clean/fpr.c      |  25 ++-
 crypto_sign/falcon-512/clean/fpr.h      |   4 +-
 crypto_sign/falcon-512/clean/inner.h    | 195 ++++++++++++++++--
 crypto_sign/falcon-512/clean/keygen.c   |  21 +-
 crypto_sign/falcon-512/clean/pqclean.c  |  38 ++--
 crypto_sign/falcon-512/clean/rng.c      |   4 +-
 crypto_sign/falcon-512/clean/sign.c     | 250 ++++++++++++++++++++----
 crypto_sign/falcon-512/clean/vrfy.c     | 115 ++++++++++-
 20 files changed, 1204 insertions(+), 192 deletions(-)

diff --git a/crypto_sign/falcon-1024/META.yml b/crypto_sign/falcon-1024/META.yml
index ddae840c..695cf3c0 100644
--- a/crypto_sign/falcon-1024/META.yml
+++ b/crypto_sign/falcon-1024/META.yml
@@ -4,8 +4,8 @@ claimed-nist-level: 5
 length-public-key: 1793
 length-secret-key: 2305
 length-signature: 1330
-nistkat-sha256: ad3d17869fdc05deae13ffa2ef26bde125b42f61b2dcd861a1ae20adcb2accc5
-testvectors-sha256: bd8076c13722d8c555c68fc6bd7763e1a9dd5483ee7c8d1c74dd2df459c72a40
+nistkat-sha256: ef2104e326221515621638ca03cd99802271bdd9907e2ae5fc7b8d19d696c584
+testvectors-sha256: 14ee0e3f0ea4b9b25193a54eed9100b1bb1cf5dbc7813fd9dc9180c1ea1a1042
 principal-submitters:
   - Thomas Prest
 auxiliary-submitters:
diff --git a/crypto_sign/falcon-1024/clean/common.c b/crypto_sign/falcon-1024/clean/common.c
index 7dc8ad20..bb2d7ece 100644
--- a/crypto_sign/falcon-1024/clean/common.c
+++ b/crypto_sign/falcon-1024/clean/common.c
@@ -33,10 +33,43 @@
 
 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_hash_to_point(
-    shake256_context *sc,
-    uint16_t *x, unsigned logn, uint8_t *tmp) {
+PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn) {
+    /*
+     * This is the straightforward per-the-spec implementation. It
+     * is not constant-time, thus it might reveal information on the
+     * plaintext (at least, enough to check the plaintext against a
+     * list of potential plaintexts) in a scenario where the
+     * attacker does not have access to the signature value or to
+     * the public key, but knows the nonce (without knowledge of the
+     * nonce, the hashed output cannot be matched against potential
+     * plaintexts).
+     */
+    size_t n;
 
+    n = (size_t)1 << logn;
+    while (n > 0) {
+        uint8_t buf[2];
+        uint32_t w;
+
+        inner_shake256_extract(sc, (void *)buf, sizeof buf);
+        w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+        if (w < 61445) {
+            while (w >= 12289) {
+                w -= 12289;
+            }
+            *x ++ = (uint16_t)w;
+            n --;
+        }
+    }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn, uint8_t *tmp) {
     /*
      * Each 16-bit sample is a value in 0..65535. The value is
      * kept if it falls in 0..61444 (because 61445 = 5*12289)
@@ -97,7 +130,7 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
         uint8_t buf[2];
         uint32_t w, wr;
 
-        shake256_extract(sc, buf, sizeof buf);
+        inner_shake256_extract(sc, buf, sizeof buf);
         w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
         wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
         wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@@ -196,7 +229,6 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
             *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
         }
     }
-
 }
 
 /* see inner.h */
diff --git a/crypto_sign/falcon-1024/clean/fpr.c b/crypto_sign/falcon-1024/clean/fpr.c
index b9a8999d..636b4092 100644
--- a/crypto_sign/falcon-1024/clean/fpr.c
+++ b/crypto_sign/falcon-1024/clean/fpr.c
@@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {
 
 
 uint64_t
-fpr_expm_p63(fpr x) {
+fpr_expm_p63(fpr x, fpr ccs) {
     /*
      * Polynomial approximation of exp(-x) is taken from FACCT:
      *   https://eprint.iacr.org/2018/1234
@@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {
 
     uint64_t z, y;
     unsigned u;
+    uint32_t z0, z1, y0, y1;
+    uint64_t a, b;
 
     y = C[0];
     z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
          * also have appropriate IEEE754 floating-point support,
          * which is better.
          */
-        uint32_t z0, z1, y0, y1;
-        uint64_t a, b, c;
+        uint64_t c;
 
         z0 = (uint32_t)z;
         z1 = (uint32_t)(z >> 32);
@@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
         c += (uint64_t)z1 * (uint64_t)y1;
         y = C[u] - c;
     }
+
+    /*
+     * The scaling factor must be applied at the end. Since y is now
+     * in fixed-point notation, we have to convert the factor to the
+     * same format, and do an extra integer multiplication.
+     */
+    z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+    z0 = (uint32_t)z;
+    z1 = (uint32_t)(z >> 32);
+    y0 = (uint32_t)y;
+    y1 = (uint32_t)(y >> 32);
+    a = ((uint64_t)z0 * (uint64_t)y1)
+        + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+    b = ((uint64_t)z1 * (uint64_t)y0);
+    y = (a >> 32) + (b >> 32);
+    y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+    y += (uint64_t)z1 * (uint64_t)y1;
+
     return y;
 }
 
diff --git a/crypto_sign/falcon-1024/clean/fpr.h b/crypto_sign/falcon-1024/clean/fpr.h
index 2dfc9b85..c3103dc1 100644
--- a/crypto_sign/falcon-1024/clean/fpr.h
+++ b/crypto_sign/falcon-1024/clean/fpr.h
@@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
 static const fpr fpr_one = 4607182418800017408;
 static const fpr fpr_two = 4611686018427387904;
 static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
 static const fpr fpr_ptwo31 = 4746794007248502784;
 static const fpr fpr_ptwo31m1 = 4746794007244308480;
 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
  * bits or so.
  */
 #define fpr_expm_p63   PQCLEAN_FALCON1024_CLEAN_fpr_expm_p63
-uint64_t fpr_expm_p63(fpr x);
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
 
 #define fpr_gm_tab   PQCLEAN_FALCON1024_CLEAN_fpr_gm_tab
 extern const fpr fpr_gm_tab[];
diff --git a/crypto_sign/falcon-1024/clean/inner.h b/crypto_sign/falcon-1024/clean/inner.h
index a12d3755..98855160 100644
--- a/crypto_sign/falcon-1024/clean/inner.h
+++ b/crypto_sign/falcon-1024/clean/inner.h
@@ -34,6 +34,45 @@
  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
  */
 
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ *  - All public functions (i.e. the non-static ones) must be referenced
+ *    with the PQCLEAN_FALCON1024_CLEAN_ macro (e.g. PQCLEAN_FALCON1024_CLEAN_verify_raw for the verify_raw()
+ *    function). That macro adds a prefix to the name, which is
+ *    configurable with the FALCON_PREFIX macro. This allows compiling
+ *    the code into a specific "namespace" and potentially including
+ *    several versions of this code into a single application (e.g. to
+ *    have an AVX2 and a non-AVX2 variants and select the one to use at
+ *    runtime based on availability of AVX2 opcodes).
+ *
+ *  - Functions that need temporary buffers expect them as a final
+ *    tmp[] array of type uint8_t*, with a size which is documented for
+ *    each function. However, most have some alignment requirements,
+ *    because they will use the array to store 16-bit, 32-bit or 64-bit
+ *    values (e.g. uint64_t or double). The caller must ensure proper
+ *    alignment. What happens on unaligned access depends on the
+ *    underlying architecture, ranging from a slight time penalty
+ *    to immediate termination of the process.
+ *
+ *  - Some functions rely on specific rounding rules and precision for
+ *    floating-point numbers. On some systems (in particular 32-bit x86
+ *    with the 387 FPU), this requires setting a hardware control
+ *    word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ *      oldcw = set_fpu_cw(2);
+ *      PQCLEAN_FALCON1024_CLEAN_sign_dyn(...);
+ *      set_fpu_cw(oldcw);
+ *
+ *    On systems where the native floating-point precision is already
+ *    proper, or integer-based emulation is used, the set_fpu_cw()
+ *    function does nothing, so it can be called systematically.
+ */
+
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -42,22 +81,47 @@
 
 
 
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+    return x;
+}
+
+
+
+
 /* ==================================================================== */
 /*
  * SHAKE256 implementation (shake.c).
  *
  * API is defined to be easily replaced with the fips202.h API defined
- * as part of PQ Clean.
+ * as part of PQClean.
  */
 
 
 #include "fips202.h"
 
-#define shake256_context                 shake256incctx
-#define shake256_init(sc)                shake256_inc_init(sc)
-#define shake256_inject(sc, in, len)     shake256_inc_absorb(sc, in, len)
-#define shake256_flip(sc)                shake256_inc_finalize(sc)
-#define shake256_extract(sc, out, len)   shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_context                shake256incctx
+#define inner_shake256_init(sc)               shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len)    shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc)               shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len)  shake256_inc_squeeze(out, len, sc)
 
 
 /* ==================================================================== */
@@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON1024_CLEAN_max_sig_bits[];
 
 /*
  * From a SHAKE256 context (must be already flipped), produce a new
- * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
  */
-void PQCLEAN_FALCON1024_CLEAN_hash_to_point(shake256_context *sc,
+void PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+        uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
         uint16_t *x, unsigned logn, uint8_t *tmp);
 
 /*
@@ -184,6 +261,8 @@ void PQCLEAN_FALCON1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
  *   logn      is the degree log
  *   tmp[]     temporary, must have at least 2*2^logn bytes
  * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
                                         const uint16_t *h, unsigned logn, uint8_t *tmp);
@@ -195,6 +274,7 @@ int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
  * reported if f is not invertible mod phi mod q).
  *
  * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
         const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@@ -208,11 +288,53 @@ int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
  * The tmp[] array must have room for at least 4*2^logn bytes.
  *
  * Returned value is 1 in success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
         const int8_t *f, const int8_t *g, const int8_t *F,
         unsigned logn, uint8_t *tmp);
 
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ *   h[]       receives the public key (NOT in NTT/Montgomery format)
+ *   c0[]      contains the hashed nonce+message
+ *   s1[]      is the first signature half
+ *   s2[]      is the second signature half
+ *   logn      is the degree log
+ *   tmp[]     temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regard to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
+        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+        unsigned logn, uint8_t *tmp);
+
 /* ==================================================================== */
 /*
  * Implementation of floating-point real numbers (fpr.h, fpr.c).
@@ -358,7 +480,7 @@ typedef struct {
  * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
  * context (in "flipped" state) to obtain its initial state.
  */
-void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src);
+void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src);
 
 /*
  * Refill the PRNG buffer. This is normally invoked automatically, and
@@ -586,6 +708,9 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,
 
 /*
  * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
  */
 #define FALCON_KEYGEN_TEMP_1      136
 #define FALCON_KEYGEN_TEMP_2      272
@@ -608,8 +733,11 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,
  * public key is written in h. Either or both of G and h may be NULL,
  * in which case the corresponding element is not returned (they can
  * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
                                      int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                      unsigned logn, uint8_t *tmp);
 
@@ -624,6 +752,9 @@ void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
  * a total of (8*logn+40)*2^logn bytes.
  *
  * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
 void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
         const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@@ -636,9 +767,15 @@ void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
  *
  * The sig[] and hm[] buffers may overlap.
  *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
  * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                         const fpr *expanded_key,
                                         const uint16_t *hm, unsigned logn, uint8_t *tmp);
 
@@ -651,13 +788,47 @@ void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
  *
  * The sig[] and hm[] buffers may overlap.
  *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
  * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                        const int8_t *f, const int8_t *g,
                                        const int8_t *F, const int8_t *G,
                                        const uint16_t *hm, unsigned logn, uint8_t *tmp);
 
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ *   ctx      pointer to the sampler_context structure
+ *   mu       center for the distribution
+ *   isigma   inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+    prng p;
+    fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p);
+
 /* ==================================================================== */
 
 #endif
diff --git a/crypto_sign/falcon-1024/clean/keygen.c b/crypto_sign/falcon-1024/clean/keygen.c
index ad6eb66f..47081537 100644
--- a/crypto_sign/falcon-1024/clean/keygen.c
+++ b/crypto_sign/falcon-1024/clean/keygen.c
@@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
 
 /* ==================================================================== */
 
+
+#define RNG_CONTEXT   inner_shake256_context
+
 /*
  * Get a random 8-byte integer from a SHAKE-based RNG. This function
  * ensures consistent interpretation of the SHAKE output so that
@@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
  * a known seed is used.
  */
 static inline uint64_t
-get_rng_u64(shake256_context *rng) {
+get_rng_u64(inner_shake256_context *rng) {
     /*
      * We enforce little-endian representation.
      */
 
     uint8_t tmp[8];
 
-    shake256_extract(rng, tmp, sizeof tmp);
+    inner_shake256_extract(rng, tmp, sizeof tmp);
     return (uint64_t)tmp[0]
            | ((uint64_t)tmp[1] << 8)
            | ((uint64_t)tmp[2] << 16)
@@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
            | ((uint64_t)tmp[7] << 56);
 }
 
+
 /*
  * Table below incarnates a discrete Gaussian distribution:
  *    D(x) = exp(-(x^2)/(2*sigma^2))
@@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
  * together for lower dimensions.
  */
 static int
-mkgauss(shake256_context *rng, unsigned logn) {
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
     unsigned u, g;
     int val;
 
@@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
             fpr xv;
 
             xv = fpr_mul(rt2[u], pdc);
+
             /*
              * Sometimes the values can be out-of-bounds if
              * the algorithm fails; we must not call
@@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
  * also makes sure that the resultant of the polynomial with phi is odd.
  */
 static void
-poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
     size_t n, u;
     unsigned mod2;
 
@@ -4046,7 +4051,7 @@ restart:
 
 /* see falcon.h */
 void
-PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
                                 int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                 unsigned logn, uint8_t *tmp) {
     /*
@@ -4070,8 +4075,10 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
      */
     size_t n, u;
     uint16_t *h2, *tmp2;
+    RNG_CONTEXT *rc;
 
     n = MKN(logn);
+    rc = rng;
 
     /*
      * We need to generate f and g randomly, until we find values
@@ -4104,8 +4111,8 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
          * (i.e. the resultant of the polynomial with phi
          * will be odd).
          */
-        poly_small_mkgauss(rng, f, logn);
-        poly_small_mkgauss(rng, g, logn);
+        poly_small_mkgauss(rc, f, logn);
+        poly_small_mkgauss(rc, g, logn);
 
         /*
          * Verify that all coefficients are within the bounds
diff --git a/crypto_sign/falcon-1024/clean/pqclean.c b/crypto_sign/falcon-1024/clean/pqclean.c
index 50e21f11..bbab1921 100644
--- a/crypto_sign/falcon-1024/clean/pqclean.c
+++ b/crypto_sign/falcon-1024/clean/pqclean.c
@@ -51,16 +51,16 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(
     int8_t f[1024], g[1024], F[1024];
     uint16_t h[1024];
     unsigned char seed[48];
-    shake256_context rng;
+    inner_shake256_context rng;
     size_t u, v;
 
     /*
      * Generate key pair.
      */
     randombytes(seed, sizeof seed);
-    shake256_init(&rng);
-    shake256_inject(&rng, seed, sizeof seed);
-    shake256_flip(&rng);
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, sizeof seed);
+    inner_shake256_flip(&rng);
     PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
 
     /*
@@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
         uint16_t hm[1024];
     } r;
     unsigned char seed[48];
-    shake256_context sc;
+    inner_shake256_context sc;
     size_t u, v;
 
     /*
@@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
     /*
      * Hash message nonce + message into a vector.
      */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, r.hm, 10, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
 
     /*
      * Initialize a RNG.
      */
     randombytes(seed, sizeof seed);
-    shake256_init(&sc);
-    shake256_inject(&sc, seed, sizeof seed);
-    shake256_flip(&sc);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);
 
     /*
      * Compute and return the signature. This loops until a signature
@@ -225,7 +225,7 @@ do_verify(
     } tmp;
     uint16_t h[1024], hm[1024];
     int16_t sig[1024];
-    shake256_context sc;
+    inner_shake256_context sc;
 
     /*
      * Decode public key.
@@ -253,11 +253,11 @@ do_verify(
     /*
      * Hash nonce + message into a vector.
      */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, hm, 10, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b);
 
     /*
      * Verify signature.
diff --git a/crypto_sign/falcon-1024/clean/rng.c b/crypto_sign/falcon-1024/clean/rng.c
index e247a639..6be52b65 100644
--- a/crypto_sign/falcon-1024/clean/rng.c
+++ b/crypto_sign/falcon-1024/clean/rng.c
@@ -36,7 +36,7 @@
 
 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
+PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
     /*
      * To ensure reproducibility for a given seed, we
      * must enforce little-endian interpretation of
@@ -46,7 +46,7 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
     uint64_t th, tl;
     int i;
 
-    shake256_extract(src, tmp, 56);
+    inner_shake256_extract(src, tmp, 56);
     for (i = 0; i < 14; i ++) {
         uint32_t w;
 
diff --git a/crypto_sign/falcon-1024/clean/sign.c b/crypto_sign/falcon-1024/clean/sign.c
index 9307206e..d6689eb3 100644
--- a/crypto_sign/falcon-1024/clean/sign.c
+++ b/crypto_sign/falcon-1024/clean/sign.c
@@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
     size_t n, hn;
     const fpr *tree0, *tree1;
 
-    n = (size_t)1 << logn;
-    if (n == 1) {
+    /*
+     * When logn == 2, we inline the last two recursion levels.
+     */
+    if (logn == 2) {
+        fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        tree0 = tree + 4;
+        tree1 = tree + 8;
+
+        /*
+         * We split t1 into w*, then do the recursive invocation,
+         * with output in w*. We finally merge back into z1.
+         */
+        a_re = t1[0];
+        a_im = t1[2];
+        b_re = t1[1];
+        b_im = t1[3];
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree1[3];
+        w2 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, w2);
+        a_im = fpr_sub(x1, w3);
+        b_re = tree1[0];
+        b_im = tree1[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree1[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z1[0] = w0 = fpr_add(a_re, c_re);
+        z1[2] = w2 = fpr_add(a_im, c_im);
+        z1[1] = w1 = fpr_sub(a_re, c_re);
+        z1[3] = w3 = fpr_sub(a_im, c_im);
+
+        /*
+         * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+         */
+        w0 = fpr_sub(t1[0], w0);
+        w1 = fpr_sub(t1[1], w1);
+        w2 = fpr_sub(t1[2], w2);
+        w3 = fpr_sub(t1[3], w3);
+
+        a_re = w0;
+        a_im = w2;
+        b_re = tree[0];
+        b_im = tree[2];
+        w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        a_re = w1;
+        a_im = w3;
+        b_re = tree[1];
+        b_im = tree[3];
+        w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+        w0 = fpr_add(w0, t0[0]);
+        w1 = fpr_add(w1, t0[1]);
+        w2 = fpr_add(w2, t0[2]);
+        w3 = fpr_add(w3, t0[3]);
+
+        /*
+         * Second recursive invocation.
+         */
+        a_re = w0;
+        a_im = w2;
+        b_re = w1;
+        b_im = w3;
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree0[3];
+        w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree0[0];
+        b_im = tree0[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree0[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z0[0] = fpr_add(a_re, c_re);
+        z0[2] = fpr_add(a_im, c_im);
+        z0[1] = fpr_sub(a_re, c_re);
+        z0[3] = fpr_sub(a_im, c_im);
+
+        return;
+    }
+
+    /*
+     * Case logn == 1 is reachable only when using Falcon-2 (the
+     * smallest size for which Falcon is mathematically defined, but
+     * of course way too insecure to be of any use).
+     */
+    if (logn == 1) {
+        fpr x0, x1, y0, y1, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        x0 = t1[0];
+        x1 = t1[1];
+        sigma = tree[3];
+        z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree[0];
+        b_im = tree[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, t0[0]);
+        x1 = fpr_add(c_im, t0[1]);
+        sigma = tree[2];
+        z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+        z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+        return;
+    }
+
+    /*
+     * Normal end of recursion is for logn == 0. Since the last
+     * steps of the recursions were inlined in the blocks above
+     * (when logn == 1 or 2), this case is not reachable, and is
+     * retained here only for documentation purposes.
+
+    if (logn == 0) {
         fpr x0, x1, sigma;
 
         x0 = t0[0];
@@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
         return;
     }
 
+     */
+
+    /*
+     * General recursive case (logn >= 3).
+     */
+
+    n = (size_t)1 << logn;
     hn = n >> 1;
     tree0 = tree + n;
     tree1 = tree + n + ffLDL_treesize(logn - 1);
@@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     const fpr *b00, *b01, *b10, *b11, *tree;
     fpr ni;
     uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;
 
     n = MKN(logn);
     t0 = tmp;
@@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     /*
      * Compute the signature.
      */
+    s1tmp = (int16_t *)tx;
     sqn = 0;
     ng = 0;
     for (u = 0; u < n; u ++) {
@@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
         z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
         sqn += (uint32_t)(z * z);
         ng |= sqn;
+        s1tmp[u] = (int16_t)z;
     }
     sqn |= -(ng >> 31);
 
@@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     }
     if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
         memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
         return 1;
     }
     return 0;
@@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
     fpr ni;
     uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;
 
     n = MKN(logn);
 
@@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON1024_CLEAN_iFFT(t0, logn);
     PQCLEAN_FALCON1024_CLEAN_iFFT(t1, logn);
 
+    s1tmp = (int16_t *)tx;
     sqn = 0;
     ng = 0;
     for (u = 0; u < n; u ++) {
@@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
         z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
         sqn += (uint32_t)(z * z);
         ng |= sqn;
+        s1tmp[u] = (int16_t)z;
     }
     sqn |= -(ng >> 31);
 
@@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     }
     if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
         memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
         return 1;
     }
     return 0;
@@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
  * Sample an integer value along a half-gaussian distribution centered
  * on zero and standard deviation 1.8205, with a precision of 72 bits.
  */
-static int
-gaussian0_sampler(prng *p) {
+int
+PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p) {
 
     static const uint32_t dist[] = {
-        6031371U, 13708371U, 13035518U,
-        5186761U,  1487980U, 12270720U,
-        3298653U,  4688887U,  5511555U,
-        1551448U,  9247616U,  9467675U,
-        539632U, 14076116U,  5909365U,
-        138809U, 10836485U, 13263376U,
-        26405U, 15335617U, 16601723U,
-        3714U, 14514117U, 13240074U,
-        386U,  8324059U,  3276722U,
-        29U, 12376792U,  7821247U,
-        1U, 11611789U,  3398254U,
-        0U,  1194629U,  4532444U,
-        0U,    37177U,  2973575U,
-        0U,      855U, 10369757U,
-        0U,       14U,  9441597U,
-        0U,        0U,  3075302U,
-        0U,        0U,    28626U,
-        0U,        0U,      197U,
-        0U,        0U,        1U
+        10745844u,  3068844u,  3741698u,
+        5559083u,  1580863u,  8248194u,
+        2260429u, 13669192u,  2736639u,
+        708981u,  4421575u, 10046180u,
+        169348u,  7122675u,  4136815u,
+        30538u, 13063405u,  7650655u,
+        4132u, 14505003u,  7826148u,
+        417u, 16768101u, 11363290u,
+        31u,  8444042u,  8086568u,
+        1u, 12844466u,   265321u,
+        0u,  1232676u, 13644283u,
+        0u,    38047u,  9111839u,
+        0u,      870u,  6138264u,
+        0u,       14u, 12545723u,
+        0u,        0u,  3104126u,
+        0u,        0u,    28824u,
+        0u,        0u,      198u,
+        0u,        0u,        1u
     };
 
     uint32_t v0, v1, v2, hi;
@@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
  * Sample a bit with probability exp(-x) for some x >= 0.
  */
 static int
-BerExp(prng *p, fpr x) {
+BerExp(prng *p, fpr x, fpr ccs) {
     int s, i;
     fpr r;
     uint32_t sw, w;
@@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
      * case). The bias is negligible since fpr_expm_p63() only computes
      * with 51 bits of precision or so.
      */
-    z = ((fpr_expm_p63(r) << 1) - 1) >> s;
+    z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
 
     /*
      * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
     return (int)(w >> 31);
 }
 
-typedef struct {
-    prng p;
-    fpr sigma_min;
-} sampler_context;
-
 /*
  * The sampler produces a random integer that follows a discrete Gaussian
  * distribution, centered on mu, and with standard deviation sigma. The
@@ -909,8 +1078,8 @@ typedef struct {
  * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
  * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
  */
-static int
-sampler(void *ctx, fpr mu, fpr isigma) {
+int
+PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
     sampler_context *spc;
     int s;
     fpr r, dss, ccs;
@@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
          *  - b = 0: z <= 0 and sampled against a Gaussian
          *    centered on 0.
          */
-        z0 = gaussian0_sampler(&spc->p);
+        z0 = PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(&spc->p);
         b = prng_get_u8(&spc->p) & 1;
         z = b + ((b << 1) - 1) * z0;
 
@@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
          */
         x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
         x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
-        x = fpr_mul(x, ccs);
-        if (BerExp(&spc->p, x)) {
+        if (BerExp(&spc->p, x, ccs)) {
             /*
              * Rejection sampling was centered on r, but the
              * actual center is mu = s + r.
@@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
 
 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                    const fpr *expanded_key,
                                    const uint16_t *hm, unsigned logn, uint8_t *tmp) {
     fpr *ftmp;
@@ -1025,7 +1193,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
                         ? fpr_sigma_min_10
                         : fpr_sigma_min_9;
         PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON1024_CLEAN_sampler;
         samp_ctx = &spc;
 
         /*
@@ -1040,7 +1208,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
 
 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                   const int8_t *f, const int8_t *g,
                                   const int8_t *F, const int8_t *G,
                                   const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@@ -1070,7 +1238,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
                         ? fpr_sigma_min_10
                         : fpr_sigma_min_9;
         PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON1024_CLEAN_sampler;
         samp_ctx = &spc;
 
         /*
diff --git a/crypto_sign/falcon-1024/clean/vrfy.c b/crypto_sign/falcon-1024/clean/vrfy.c
index 6c3f4abf..780127cf 100644
--- a/crypto_sign/falcon-1024/clean/vrfy.c
+++ b/crypto_sign/falcon-1024/clean/vrfy.c
@@ -649,7 +649,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     }
 
     /*
-     * Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
+     * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
      */
     mq_NTT(tt, logn);
     mq_poly_montymul_ntt(tt, h, logn);
@@ -657,7 +657,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     mq_poly_sub(tt, c0, logn);
 
     /*
-     * Normalize s1 elements into the [-q/2..q/2] range.
+     * Normalize -s1 elements into the [-q/2..q/2] range.
      */
     for (u = 0; u < n; u ++) {
         int32_t w;
@@ -668,7 +668,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     }
 
     /*
-     * Signature is valid if and only if the aggregate (s1,s2) vector
+     * Signature is valid if and only if the aggregate (-s1,s2) vector
      * is short enough.
      */
     return PQCLEAN_FALCON1024_CLEAN_is_short((int16_t *)tt, s2, logn);
@@ -699,7 +699,7 @@ PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
     return 1;
 }
 
-/* see internal.h */
+/* see inner.h */
 int
 PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
         const int8_t *f, const int8_t *g, const int8_t *F,
@@ -743,3 +743,110 @@ PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
     }
     return 1;
 }
+
+/* see inner.h -- returns 1 when no NTT coefficient of s2 (mod q) is zero, 0 otherwise */
+int
+PQCLEAN_FALCON1024_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;   /* number of coefficients: 2^logn */
+    tt = (uint16_t *)tmp;   /* tmp[] is used as an array of n 16-bit values */
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];   /* sign-extended: bit 31 is set iff s2[u] < 0 */
+        w += Q & -(w >> 31);   /* branch-free: add Q only when s2[u] was negative */
+        tt[u] = (uint16_t)w;
+    }
+    mq_NTT(tt, logn);   /* helper not shown here; per its name, converts tt[] to NTT form */
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1);   /* tt[u] == 0 yields -1, which sets the top bit of r */
+    }
+    return (int)(1u - (r >> 31));   /* 1 if no zero coefficient was seen, else 0 */
+}
+
+/* see inner.h -- rebuilds h = (c0 - s1) / s2; returns 1 only if s2 is invertible and (s1, s2) is short */
+int
+PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
+                                        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+                                        unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;   /* number of coefficients: 2^logn */
+
+    /*
+     * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+     * and c0 - s1 into h[].
+     */
+    tt = (uint16_t *)tmp;   /* tmp[] is used as an array of n 16-bit values */
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];   /* sign-extended: bit 31 is set iff s2[u] < 0 */
+        w += Q & -(w >> 31);   /* branch-free: add Q only when s2[u] was negative */
+        tt[u] = (uint16_t)w;
+
+        w = (uint32_t)s1[u];
+        w += Q & -(w >> 31);   /* same normalization for s1[u] */
+        w = mq_sub(c0[u], w);
+        h[u] = (uint16_t)w;
+    }
+
+    /*
+     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+     * is zero (in NTT representation) then the operation fails. We
+     * keep that information in a flag so that we do not deviate
+     * from strict constant-time processing; if all coefficients of
+     * s2 are non-zero, then the high bit of r will be zero.
+     */
+    mq_NTT(tt, logn);
+    mq_NTT(h, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1);   /* tt[u] == 0 yields -1, which sets the top bit of r */
+        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);   /* performed even when tt[u] == 0; r records the failure */
+    }
+    mq_iNTT(h, logn);
+
+    /*
+     * Signature is acceptable if and only if it is short enough,
+     * and s2 was invertible mod phi mod q. The caller must still
+     * check that the rebuilt public key matches the expected
+     * value (e.g. through a hash).
+     */
+    r = ~r & (uint32_t) - PQCLEAN_FALCON1024_CLEAN_is_short(s1, s2, logn);   /* assumes is_short() returns 0 or 1, so its negation is an all-zeros/all-ones mask */
+    return (int)(r >> 31);   /* top bit is 1 only when both conditions hold */
+}
+
+/* see inner.h -- returns how many coefficients of sig (mod q) are zero after mq_NTT() */
+int
+PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+    uint16_t *s2;
+    size_t u, n;
+    uint32_t r;
+
+    n = (size_t)1 << logn;   /* number of coefficients: 2^logn */
+    s2 = (uint16_t *)tmp;   /* tmp[] is used as an array of n 16-bit values */
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)sig[u];   /* sign-extended: bit 31 is set iff sig[u] < 0 */
+        w += Q & -(w >> 31);   /* branch-free: add Q only when sig[u] was negative */
+        s2[u] = (uint16_t)w;
+    }
+    mq_NTT(s2, logn);   /* helper not shown here; per its name, converts s2[] to NTT form */
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u] - 1u;   /* wraps to 0xFFFFFFFF exactly when s2[u] == 0 */
+        r += (w >> 31);   /* adds 1 for a zero coefficient, 0 otherwise */
+    }
+    return (int)r;
+}
diff --git a/crypto_sign/falcon-512/META.yml b/crypto_sign/falcon-512/META.yml
index e12b3db9..e66ba9be 100644
--- a/crypto_sign/falcon-512/META.yml
+++ b/crypto_sign/falcon-512/META.yml
@@ -4,8 +4,8 @@ claimed-nist-level: 1
 length-public-key: 897
 length-secret-key: 1281
 length-signature: 690
-nistkat-sha256: abc62e7be3d7c1db757ba3cbb771cfdc89c6b36fb5efc885593db89ec2ea8bc4
-testvectors-sha256: 1a1b170fc9e4623e7ff519c15ec7a2dda55e94a175756b7c72429451bd226b09
+nistkat-sha256: e9c3985f1ce732e29ca81aeca091f20d4dbb5beb456ee1a7ab41d04add4dab10
+testvectors-sha256: 036b5e803ab825146502513b7460b24cc9493f8e366323cd5e30e2dc6d4ca6a7
 principal-submitters:
   - Thomas Prest
 auxiliary-submitters:
diff --git a/crypto_sign/falcon-512/clean/common.c b/crypto_sign/falcon-512/clean/common.c
index e46a4eb5..dcea0c1a 100644
--- a/crypto_sign/falcon-512/clean/common.c
+++ b/crypto_sign/falcon-512/clean/common.c
@@ -33,10 +33,43 @@
 
 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_hash_to_point(
-    shake256_context *sc,
-    uint16_t *x, unsigned logn, uint8_t *tmp) {
+PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn) {
+    /*
+     * This is the straightforward per-the-spec implementation. It
+     * is not constant-time, thus it might reveal information on the
+     * plaintext (at least, enough to check the plaintext against a
+     * list of potential plaintexts) in a scenario where the
+     * attacker does not have access to the signature value or to
+     * the public key, but knows the nonce (without knowledge of the
+     * nonce, the hashed output cannot be matched against potential
+     * plaintexts).
+     */
+    size_t n;
 
+    n = (size_t)1 << logn;   /* number of coefficients still to produce: 2^logn */
+    while (n > 0) {
+        uint8_t buf[2];
+        uint32_t w;
+
+        inner_shake256_extract(sc, (void *)buf, sizeof buf);   /* two fresh bytes per attempt */
+        w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];   /* 16-bit value, buf[0] is the high byte */
+        if (w < 61445) {   /* keep only samples below 61445 = 5*12289; others are discarded */
+            while (w >= 12289) {   /* reduce w into 0..12288 by repeated subtraction */
+                w -= 12289;
+            }
+            *x ++ = (uint16_t)w;
+            n --;   /* one more coefficient written */
+        }
+    }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn, uint8_t *tmp) {
     /*
      * Each 16-bit sample is a value in 0..65535. The value is
      * kept if it falls in 0..61444 (because 61445 = 5*12289)
@@ -97,7 +130,7 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
         uint8_t buf[2];
         uint32_t w, wr;
 
-        shake256_extract(sc, buf, sizeof buf);
+        inner_shake256_extract(sc, buf, sizeof buf);
         w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
         wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
         wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@@ -196,7 +229,6 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
             *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
         }
     }
-
 }
 
 /* see inner.h */
diff --git a/crypto_sign/falcon-512/clean/fpr.c b/crypto_sign/falcon-512/clean/fpr.c
index b9a8999d..636b4092 100644
--- a/crypto_sign/falcon-512/clean/fpr.c
+++ b/crypto_sign/falcon-512/clean/fpr.c
@@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {
 
 
 uint64_t
-fpr_expm_p63(fpr x) {
+fpr_expm_p63(fpr x, fpr ccs) {
     /*
      * Polynomial approximation of exp(-x) is taken from FACCT:
      *   https://eprint.iacr.org/2018/1234
@@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {
 
     uint64_t z, y;
     unsigned u;
+    uint32_t z0, z1, y0, y1;
+    uint64_t a, b;
 
     y = C[0];
     z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
          * also have appropriate IEEE754 floating-point support,
          * which is better.
          */
-        uint32_t z0, z1, y0, y1;
-        uint64_t a, b, c;
+        uint64_t c;
 
         z0 = (uint32_t)z;
         z1 = (uint32_t)(z >> 32);
@@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
         c += (uint64_t)z1 * (uint64_t)y1;
         y = C[u] - c;
     }
+
+    /*
+     * The scaling factor must be applied at the end. Since y is now
+     * in fixed-point notation, we have to convert the factor to the
+     * same format, and do an extra integer multiplication.
+     */
+    z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+    z0 = (uint32_t)z;
+    z1 = (uint32_t)(z >> 32);
+    y0 = (uint32_t)y;
+    y1 = (uint32_t)(y >> 32);
+    a = ((uint64_t)z0 * (uint64_t)y1)
+        + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+    b = ((uint64_t)z1 * (uint64_t)y0);
+    y = (a >> 32) + (b >> 32);
+    y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+    y += (uint64_t)z1 * (uint64_t)y1;
+
     return y;
 }
 
diff --git a/crypto_sign/falcon-512/clean/fpr.h b/crypto_sign/falcon-512/clean/fpr.h
index ef7275a1..f29e55f3 100644
--- a/crypto_sign/falcon-512/clean/fpr.h
+++ b/crypto_sign/falcon-512/clean/fpr.h
@@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
 static const fpr fpr_one = 4607182418800017408;
 static const fpr fpr_two = 4611686018427387904;
 static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
 static const fpr fpr_ptwo31 = 4746794007248502784;
 static const fpr fpr_ptwo31m1 = 4746794007244308480;
 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
  * bits or so.
  */
 #define fpr_expm_p63   PQCLEAN_FALCON512_CLEAN_fpr_expm_p63
-uint64_t fpr_expm_p63(fpr x);
+uint64_t fpr_expm_p63(fpr x, fpr ccs);
 
 #define fpr_gm_tab   PQCLEAN_FALCON512_CLEAN_fpr_gm_tab
 extern const fpr fpr_gm_tab[];
diff --git a/crypto_sign/falcon-512/clean/inner.h b/crypto_sign/falcon-512/clean/inner.h
index 4861df6e..78c74bb8 100644
--- a/crypto_sign/falcon-512/clean/inner.h
+++ b/crypto_sign/falcon-512/clean/inner.h
@@ -34,6 +34,45 @@
  * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
  */
 
+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ *  - All public functions (i.e. the non-static ones) must be referenced
+ *    with the PQCLEAN_FALCON512_CLEAN_ macro (e.g. PQCLEAN_FALCON512_CLEAN_verify_raw for the verify_raw()
+ *    function). That macro adds a prefix to the name, which is
+ *    configurable with the FALCON_PREFIX macro. This allows compiling
+ *    the code into a specific "namespace" and potentially including
+ *    several versions of this code into a single application (e.g. to
+ *    have an AVX2 and a non-AVX2 variants and select the one to use at
+ *    runtime based on availability of AVX2 opcodes).
+ *
+ *  - Functions that need temporary buffers expect them as a final
+ *    tmp[] array of type uint8_t*, with a size which is documented for
+ *    each function. However, most have some alignment requirements,
+ *    because they will use the array to store 16-bit, 32-bit or 64-bit
+ *    values (e.g. uint64_t or double). The caller must ensure proper
+ *    alignment. What happens on unaligned access depends on the
+ *    underlying architecture, ranging from a slight time penalty
+ *    to immediate termination of the process.
+ *
+ *  - Some functions rely on specific rounding rules and precision for
+ *    floating-point numbers. On some systems (in particular 32-bit x86
+ *    with the 387 FPU), this requires setting a hardware control
+ *    word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ *      oldcw = set_fpu_cw(2);
+ *      PQCLEAN_FALCON512_CLEAN_sign_dyn(...);
+ *      set_fpu_cw(oldcw);
+ *
+ *    On systems where the native floating-point precision is already
+ *    proper, or integer-based emulation is used, the set_fpu_cw()
+ *    function does nothing, so it can be called systematically.
+ */
+
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -42,22 +81,47 @@
 
 
 
+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+    return x;   /* no-op variant: no control word is touched, the argument is returned unchanged */
+}
+
+
+
+
 /* ==================================================================== */
 /*
  * SHAKE256 implementation (shake.c).
  *
  * API is defined to be easily replaced with the fips202.h API defined
- * as part of PQ Clean.
+ * as part of PQClean.
  */
 
 
 #include "fips202.h"
 
-#define shake256_context                 shake256incctx
-#define shake256_init(sc)                shake256_inc_init(sc)
-#define shake256_inject(sc, in, len)     shake256_inc_absorb(sc, in, len)
-#define shake256_flip(sc)                shake256_inc_finalize(sc)
-#define shake256_extract(sc, out, len)   shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_context                shake256incctx
+#define inner_shake256_init(sc)               shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len)    shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc)               shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len)  shake256_inc_squeeze(out, len, sc)
 
 
 /* ==================================================================== */
@@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON512_CLEAN_max_sig_bits[];
 
 /*
  * From a SHAKE256 context (must be already flipped), produce a new
- * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
  */
-void PQCLEAN_FALCON512_CLEAN_hash_to_point(shake256_context *sc,
+void PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+        uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
         uint16_t *x, unsigned logn, uint8_t *tmp);
 
 /*
@@ -184,6 +261,8 @@ void PQCLEAN_FALCON512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
  *   logn      is the degree log
  *   tmp[]     temporary, must have at least 2*2^logn bytes
  * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
                                        const uint16_t *h, unsigned logn, uint8_t *tmp);
@@ -195,6 +274,7 @@ int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
  * reported if f is not invertible mod phi mod q).
  *
  * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
         const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@@ -208,11 +288,53 @@ int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
  * The tmp[] array must have room for at least 4*2^logn bytes.
  *
  * Returned value is 1 in success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
  */
 int PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
         const int8_t *f, const int8_t *g, const int8_t *F,
         unsigned logn, uint8_t *tmp);
 
+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ *   h[]       receives the public key (NOT in NTT/Montgomery format)
+ *   c0[]      contains the hashed nonce+message
+ *   s1[]      is the first signature half
+ *   s2[]      is the second signature half
+ *   logn      is the degree log
+ *   tmp[]     temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regard to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
+        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+        unsigned logn, uint8_t *tmp);
+
 /* ==================================================================== */
 /*
  * Implementation of floating-point real numbers (fpr.h, fpr.c).
@@ -358,7 +480,7 @@ typedef struct {
  * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
  * context (in "flipped" state) to obtain its initial state.
  */
-void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src);
+void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src);
 
 /*
  * Refill the PRNG buffer. This is normally invoked automatically, and
@@ -586,6 +708,9 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,
 
 /*
  * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
  */
 #define FALCON_KEYGEN_TEMP_1      136
 #define FALCON_KEYGEN_TEMP_2      272
@@ -608,8 +733,11 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,
  * public key is written in h. Either or both of G and h may be NULL,
  * in which case the corresponding element is not returned (they can
  * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
                                     int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                     unsigned logn, uint8_t *tmp);
 
@@ -624,6 +752,9 @@ void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
  * a total of (8*logn+40)*2^logn bytes.
  *
  * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
 void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
         const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@@ -636,9 +767,15 @@ void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
  *
  * The sig[] and hm[] buffers may overlap.
  *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
  * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                        const fpr *expanded_key,
                                        const uint16_t *hm, unsigned logn, uint8_t *tmp);
 
@@ -651,13 +788,47 @@ void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
  *
  * The sig[] and hm[] buffers may overlap.
  *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
  * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
  */
-void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                       const int8_t *f, const int8_t *g,
                                       const int8_t *F, const int8_t *G,
                                       const uint16_t *hm, unsigned logn, uint8_t *tmp);
 
+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ *   ctx      pointer to the sampler_context structure
+ *   mu       center for the distribution
+ *   isigma   inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+    prng p;   /* source of random numbers (PRNG) used by the sampler */
+    fpr sigma_min;   /* sigma_min value (nominally dependent on the degree) */
+} sampler_context;
+
+int PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p);
+
 /* ==================================================================== */
 
 #endif
diff --git a/crypto_sign/falcon-512/clean/keygen.c b/crypto_sign/falcon-512/clean/keygen.c
index 691165ae..b8f0dac1 100644
--- a/crypto_sign/falcon-512/clean/keygen.c
+++ b/crypto_sign/falcon-512/clean/keygen.c
@@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
 
 /* ==================================================================== */
 
+
+#define RNG_CONTEXT   inner_shake256_context
+
 /*
  * Get a random 8-byte integer from a SHAKE-based RNG. This function
  * ensures consistent interpretation of the SHAKE output so that
@@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
  * a known seed is used.
  */
 static inline uint64_t
-get_rng_u64(shake256_context *rng) {
+get_rng_u64(inner_shake256_context *rng) {
     /*
      * We enforce little-endian representation.
      */
 
     uint8_t tmp[8];
 
-    shake256_extract(rng, tmp, sizeof tmp);
+    inner_shake256_extract(rng, tmp, sizeof tmp);
     return (uint64_t)tmp[0]
            | ((uint64_t)tmp[1] << 8)
            | ((uint64_t)tmp[2] << 16)
@@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
            | ((uint64_t)tmp[7] << 56);
 }
 
+
 /*
  * Table below incarnates a discrete Gaussian distribution:
  *    D(x) = exp(-(x^2)/(2*sigma^2))
@@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
  * together for lower dimensions.
  */
 static int
-mkgauss(shake256_context *rng, unsigned logn) {
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
     unsigned u, g;
     int val;
 
@@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
             fpr xv;
 
             xv = fpr_mul(rt2[u], pdc);
+
             /*
              * Sometimes the values can be out-of-bounds if
              * the algorithm fails; we must not call
@@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
  * also makes sure that the resultant of the polynomial with phi is odd.
  */
 static void
-poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
     size_t n, u;
     unsigned mod2;
 
@@ -4046,7 +4051,7 @@ restart:
 
 /* see falcon.h */
 void
-PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
                                int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                unsigned logn, uint8_t *tmp) {
     /*
@@ -4070,8 +4075,10 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
      */
     size_t n, u;
     uint16_t *h2, *tmp2;
+    RNG_CONTEXT *rc;
 
     n = MKN(logn);
+    rc = rng;
 
     /*
      * We need to generate f and g randomly, until we find values
@@ -4104,8 +4111,8 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
          * (i.e. the resultant of the polynomial with phi
          * will be odd).
          */
-        poly_small_mkgauss(rng, f, logn);
-        poly_small_mkgauss(rng, g, logn);
+        poly_small_mkgauss(rc, f, logn);
+        poly_small_mkgauss(rc, g, logn);
 
         /*
          * Verify that all coefficients are within the bounds
diff --git a/crypto_sign/falcon-512/clean/pqclean.c b/crypto_sign/falcon-512/clean/pqclean.c
index 6e5ddd99..c31599b5 100644
--- a/crypto_sign/falcon-512/clean/pqclean.c
+++ b/crypto_sign/falcon-512/clean/pqclean.c
@@ -51,16 +51,16 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(
     int8_t f[512], g[512], F[512];
     uint16_t h[512];
     unsigned char seed[48];
-    shake256_context rng;
+    inner_shake256_context rng;
     size_t u, v;
 
     /*
      * Generate key pair.
      */
     randombytes(seed, sizeof seed);
-    shake256_init(&rng);
-    shake256_inject(&rng, seed, sizeof seed);
-    shake256_flip(&rng);
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, sizeof seed);
+    inner_shake256_flip(&rng);
     PQCLEAN_FALCON512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
 
     /*
@@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
         uint16_t hm[512];
     } r;
     unsigned char seed[48];
-    shake256_context sc;
+    inner_shake256_context sc;
     size_t u, v;
 
     /*
@@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
     /*
      * Hash message nonce + message into a vector.
      */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, r.hm, 9, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
 
     /*
      * Initialize a RNG.
      */
     randombytes(seed, sizeof seed);
-    shake256_init(&sc);
-    shake256_inject(&sc, seed, sizeof seed);
-    shake256_flip(&sc);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);
 
     /*
      * Compute and return the signature. This loops until a signature
@@ -225,7 +225,7 @@ do_verify(
     } tmp;
     uint16_t h[512], hm[512];
     int16_t sig[512];
-    shake256_context sc;
+    inner_shake256_context sc;
 
     /*
      * Decode public key.
@@ -253,11 +253,11 @@ do_verify(
     /*
      * Hash nonce + message into a vector.
      */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, hm, 9, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b);
 
     /*
      * Verify signature.
diff --git a/crypto_sign/falcon-512/clean/rng.c b/crypto_sign/falcon-512/clean/rng.c
index f09bec93..93859344 100644
--- a/crypto_sign/falcon-512/clean/rng.c
+++ b/crypto_sign/falcon-512/clean/rng.c
@@ -36,7 +36,7 @@
 
 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
+PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
     /*
      * To ensure reproducibility for a given seed, we
      * must enforce little-endian interpretation of
@@ -46,7 +46,7 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
     uint64_t th, tl;
     int i;
 
-    shake256_extract(src, tmp, 56);
+    inner_shake256_extract(src, tmp, 56);
     for (i = 0; i < 14; i ++) {
         uint32_t w;
 
diff --git a/crypto_sign/falcon-512/clean/sign.c b/crypto_sign/falcon-512/clean/sign.c
index 9fd0fc78..d53fda29 100644
--- a/crypto_sign/falcon-512/clean/sign.c
+++ b/crypto_sign/falcon-512/clean/sign.c
@@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
     size_t n, hn;
     const fpr *tree0, *tree1;
 
-    n = (size_t)1 << logn;
-    if (n == 1) {
+    /*
+     * When logn == 2, we inline the last two recursion levels.
+     */
+    if (logn == 2) {
+        fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        tree0 = tree + 4;
+        tree1 = tree + 8;
+
+        /*
+         * We split t1 into w*, then do the recursive invocation,
+         * with output in w*. We finally merge back into z1.
+         */
+        a_re = t1[0];
+        a_im = t1[2];
+        b_re = t1[1];
+        b_im = t1[3];
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree1[3];
+        w2 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, w2);
+        a_im = fpr_sub(x1, w3);
+        b_re = tree1[0];
+        b_im = tree1[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree1[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z1[0] = w0 = fpr_add(a_re, c_re);
+        z1[2] = w2 = fpr_add(a_im, c_im);
+        z1[1] = w1 = fpr_sub(a_re, c_re);
+        z1[3] = w3 = fpr_sub(a_im, c_im);
+
+        /*
+         * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+         */
+        w0 = fpr_sub(t1[0], w0);
+        w1 = fpr_sub(t1[1], w1);
+        w2 = fpr_sub(t1[2], w2);
+        w3 = fpr_sub(t1[3], w3);
+
+        a_re = w0;
+        a_im = w2;
+        b_re = tree[0];
+        b_im = tree[2];
+        w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        a_re = w1;
+        a_im = w3;
+        b_re = tree[1];
+        b_im = tree[3];
+        w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+        w0 = fpr_add(w0, t0[0]);
+        w1 = fpr_add(w1, t0[1]);
+        w2 = fpr_add(w2, t0[2]);
+        w3 = fpr_add(w3, t0[3]);
+
+        /*
+         * Second recursive invocation.
+         */
+        a_re = w0;
+        a_im = w2;
+        b_re = w1;
+        b_im = w3;
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree0[3];
+        w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree0[0];
+        b_im = tree0[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree0[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z0[0] = fpr_add(a_re, c_re);
+        z0[2] = fpr_add(a_im, c_im);
+        z0[1] = fpr_sub(a_re, c_re);
+        z0[3] = fpr_sub(a_im, c_im);
+
+        return;
+    }
+
+    /*
+     * Case logn == 1 is reachable only when using Falcon-2 (the
+     * smallest size for which Falcon is mathematically defined, but
+     * of course way too insecure to be of any use).
+     */
+    if (logn == 1) {
+        fpr x0, x1, y0, y1, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        x0 = t1[0];
+        x1 = t1[1];
+        sigma = tree[3];
+        z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree[0];
+        b_im = tree[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, t0[0]);
+        x1 = fpr_add(c_im, t0[1]);
+        sigma = tree[2];
+        z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+        z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+        return;
+    }
+
+    /*
+     * Normal end of recursion is for logn == 0. Since the last
+     * steps of the recursions were inlined in the blocks above
+     * (when logn == 1 or 2), this case is not reachable, and is
+     * retained here only for documentation purposes.
+
+    if (logn == 0) {
         fpr x0, x1, sigma;
 
         x0 = t0[0];
@@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
         return;
     }
 
+     */
+
+    /*
+     * General recursive case (logn >= 3).
+     */
+
+    n = (size_t)1 << logn;
     hn = n >> 1;
     tree0 = tree + n;
     tree1 = tree + n + ffLDL_treesize(logn - 1);
@@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     const fpr *b00, *b01, *b10, *b11, *tree;
     fpr ni;
     uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;
 
     n = MKN(logn);
     t0 = tmp;
@@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     /*
      * Compute the signature.
      */
+    s1tmp = (int16_t *)tx;
     sqn = 0;
     ng = 0;
     for (u = 0; u < n; u ++) {
@@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
         z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
         sqn += (uint32_t)(z * z);
         ng |= sqn;
+        s1tmp[u] = (int16_t)z;
     }
     sqn |= -(ng >> 31);
 
@@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
     }
     if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
         memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
         return 1;
     }
     return 0;
@@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
     fpr ni;
     uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;
 
     n = MKN(logn);
 
@@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     PQCLEAN_FALCON512_CLEAN_iFFT(t0, logn);
     PQCLEAN_FALCON512_CLEAN_iFFT(t1, logn);
 
+    s1tmp = (int16_t *)tx;
     sqn = 0;
     ng = 0;
     for (u = 0; u < n; u ++) {
@@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
         z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
         sqn += (uint32_t)(z * z);
         ng |= sqn;
+        s1tmp[u] = (int16_t)z;
     }
     sqn |= -(ng >> 31);
 
@@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
     }
     if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
         memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
         return 1;
     }
     return 0;
@@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
  * Sample an integer value along a half-gaussian distribution centered
  * on zero and standard deviation 1.8205, with a precision of 72 bits.
  */
-static int
-gaussian0_sampler(prng *p) {
+int
+PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p) {
 
     static const uint32_t dist[] = {
-        6031371U, 13708371U, 13035518U,
-        5186761U,  1487980U, 12270720U,
-        3298653U,  4688887U,  5511555U,
-        1551448U,  9247616U,  9467675U,
-        539632U, 14076116U,  5909365U,
-        138809U, 10836485U, 13263376U,
-        26405U, 15335617U, 16601723U,
-        3714U, 14514117U, 13240074U,
-        386U,  8324059U,  3276722U,
-        29U, 12376792U,  7821247U,
-        1U, 11611789U,  3398254U,
-        0U,  1194629U,  4532444U,
-        0U,    37177U,  2973575U,
-        0U,      855U, 10369757U,
-        0U,       14U,  9441597U,
-        0U,        0U,  3075302U,
-        0U,        0U,    28626U,
-        0U,        0U,      197U,
-        0U,        0U,        1U
+        10745844u,  3068844u,  3741698u,
+        5559083u,  1580863u,  8248194u,
+        2260429u, 13669192u,  2736639u,
+        708981u,  4421575u, 10046180u,
+        169348u,  7122675u,  4136815u,
+        30538u, 13063405u,  7650655u,
+        4132u, 14505003u,  7826148u,
+        417u, 16768101u, 11363290u,
+        31u,  8444042u,  8086568u,
+        1u, 12844466u,   265321u,
+        0u,  1232676u, 13644283u,
+        0u,    38047u,  9111839u,
+        0u,      870u,  6138264u,
+        0u,       14u, 12545723u,
+        0u,        0u,  3104126u,
+        0u,        0u,    28824u,
+        0u,        0u,      198u,
+        0u,        0u,        1u
     };
 
     uint32_t v0, v1, v2, hi;
@@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
  * Sample a bit with probability exp(-x) for some x >= 0.
  */
 static int
-BerExp(prng *p, fpr x) {
+BerExp(prng *p, fpr x, fpr ccs) {
     int s, i;
     fpr r;
     uint32_t sw, w;
@@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
      * case). The bias is negligible since fpr_expm_p63() only computes
      * with 51 bits of precision or so.
      */
-    z = ((fpr_expm_p63(r) << 1) - 1) >> s;
+    z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
 
     /*
      * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
     return (int)(w >> 31);
 }
 
-typedef struct {
-    prng p;
-    fpr sigma_min;
-} sampler_context;
-
 /*
  * The sampler produces a random integer that follows a discrete Gaussian
  * distribution, centered on mu, and with standard deviation sigma. The
@@ -909,8 +1078,8 @@ typedef struct {
  * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
  * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
  */
-static int
-sampler(void *ctx, fpr mu, fpr isigma) {
+int
+PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
     sampler_context *spc;
     int s;
     fpr r, dss, ccs;
@@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
          *  - b = 0: z <= 0 and sampled against a Gaussian
          *    centered on 0.
          */
-        z0 = gaussian0_sampler(&spc->p);
+        z0 = PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(&spc->p);
         b = prng_get_u8(&spc->p) & 1;
         z = b + ((b << 1) - 1) * z0;
 
@@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
          */
         x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
         x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
-        x = fpr_mul(x, ccs);
-        if (BerExp(&spc->p, x)) {
+        if (BerExp(&spc->p, x, ccs)) {
             /*
              * Rejection sampling was centered on r, but the
              * actual center is mu = s + r.
@@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
 
 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                   const fpr *expanded_key,
                                   const uint16_t *hm, unsigned logn, uint8_t *tmp) {
     fpr *ftmp;
@@ -1025,7 +1193,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
                         ? fpr_sigma_min_10
                         : fpr_sigma_min_9;
         PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON512_CLEAN_sampler;
         samp_ctx = &spc;
 
         /*
@@ -1040,7 +1208,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
 
 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                  const int8_t *f, const int8_t *g,
                                  const int8_t *F, const int8_t *G,
                                  const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@@ -1070,7 +1238,7 @@ PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
                         ? fpr_sigma_min_10
                         : fpr_sigma_min_9;
         PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON512_CLEAN_sampler;
         samp_ctx = &spc;
 
         /*
diff --git a/crypto_sign/falcon-512/clean/vrfy.c b/crypto_sign/falcon-512/clean/vrfy.c
index 839e80ea..779bd2c8 100644
--- a/crypto_sign/falcon-512/clean/vrfy.c
+++ b/crypto_sign/falcon-512/clean/vrfy.c
@@ -649,7 +649,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     }
 
     /*
-     * Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
+     * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
      */
     mq_NTT(tt, logn);
     mq_poly_montymul_ntt(tt, h, logn);
@@ -657,7 +657,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     mq_poly_sub(tt, c0, logn);
 
     /*
-     * Normalize s1 elements into the [-q/2..q/2] range.
+     * Normalize -s1 elements into the [-q/2..q/2] range.
      */
     for (u = 0; u < n; u ++) {
         int32_t w;
@@ -668,7 +668,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
     }
 
     /*
-     * Signature is valid if and only if the aggregate (s1,s2) vector
+     * Signature is valid if and only if the aggregate (-s1,s2) vector
      * is short enough.
      */
     return PQCLEAN_FALCON512_CLEAN_is_short((int16_t *)tt, s2, logn);
@@ -699,7 +699,7 @@ PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
     return 1;
 }
 
-/* see internal.h */
+/* see inner.h */
 int
 PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
         const int8_t *f, const int8_t *g, const int8_t *F,
@@ -743,3 +743,110 @@ PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
     }
     return 1;
 }
+
+/* Returns 1 if s2 has no zero coefficient in NTT representation, 0 otherwise; see inner.h */
+int
+PQCLEAN_FALCON512_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+    tt = (uint16_t *)tmp; /* tmp is used as scratch space for n 16-bit values */
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31); /* add Q when the sign bit is set (negative input) */
+        tt[u] = (uint16_t)w;
+    }
+    mq_NTT(tt, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1); /* this term has its top bit set only when tt[u] == 0 */
+    }
+    return (int)(1u - (r >> 31)); /* branch-free: 1 when no zero coefficient was seen */
+}
+
+/* Recomputes h = (c0 - s1) / s2 into h[]; returns 1 iff s2 is invertible and (s1, s2) is short. See inner.h */
+int
+PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
+                                       const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+                                       unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+
+    /*
+     * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+     * and c0 - s1 into h[].
+     */
+    tt = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31);
+        tt[u] = (uint16_t)w;
+
+        w = (uint32_t)s1[u];
+        w += Q & -(w >> 31);
+        w = mq_sub(c0[u], w);
+        h[u] = (uint16_t)w;
+    }
+
+    /*
+     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+     * is zero (in NTT representation) then the operation fails. We
+     * keep that information in a flag so that we do not deviate
+     * from strict constant-time processing; if all coefficients of
+     * s2 are non-zero, then the high bit of r will be zero.
+     */
+    mq_NTT(tt, logn);
+    mq_NTT(h, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1);
+        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+    }
+    mq_iNTT(h, logn);
+
+    /*
+     * Signature is acceptable if and only if it is short enough,
+     * and s2 was invertible mod phi mod q. The caller must still
+     * check that the rebuilt public key matches the expected
+     * value (e.g. through a hash).
+     */
+    r = ~r & (uint32_t) - PQCLEAN_FALCON512_CLEAN_is_short(s1, s2, logn);
+    return (int)(r >> 31);
+}
+
+/* Returns how many coefficients of sig are zero in NTT representation; see inner.h */
+int
+PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+    uint16_t *s2;
+    size_t u, n;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+    s2 = (uint16_t *)tmp; /* tmp is used as scratch space for n 16-bit values */
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)sig[u];
+        w += Q & -(w >> 31); /* add Q when the sign bit is set (negative input) */
+        s2[u] = (uint16_t)w;
+    }
+    mq_NTT(s2, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u] - 1u; /* wraps to 0xFFFFFFFF only when s2[u] == 0 */
+        r += (w >> 31); /* so this adds 1 for each zero coefficient */
+    }
+    return (int)r;
+}