Fixed sampler bug (update to new upstream Falcon code 2019-09-18).

2024-11-22 07:35:38 +00:00 · 2019-09-19 15:55:28 -04:00 · 2019-09-19 15:55:28 -04:00 · 44a050106a
commit 44a050106a
parent 4be5e497dc
20 changed files with 1204 additions and 192 deletions
--- a/crypto_sign/falcon-1024/META.yml
+++ b/crypto_sign/falcon-1024/META.yml
@ -4,8 +4,8 @@ claimed-nist-level: 5
 length-public-key: 1793
 length-secret-key: 2305
 length-signature: 1330
-nistkat-sha256: ad3d17869fdc05deae13ffa2ef26bde125b42f61b2dcd861a1ae20adcb2accc5
-testvectors-sha256: bd8076c13722d8c555c68fc6bd7763e1a9dd5483ee7c8d1c74dd2df459c72a40
+nistkat-sha256: ef2104e326221515621638ca03cd99802271bdd9907e2ae5fc7b8d19d696c584
+testvectors-sha256: 14ee0e3f0ea4b9b25193a54eed9100b1bb1cf5dbc7813fd9dc9180c1ea1a1042
 principal-submitters:
  - Thomas Prest
 auxiliary-submitters:
--- a/crypto_sign/falcon-1024/clean/common.c
+++ b/crypto_sign/falcon-1024/clean/common.c
@ -33,10 +33,43 @@

 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_hash_to_point(
-    shake256_context *sc,
-    uint16_t *x, unsigned logn, uint8_t *tmp) {
+PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn) {
+    /*
+     * This is the straightforward per-the-spec implementation. It
+     * is not constant-time, thus it might reveal information on the
+     * plaintext (at least, enough to check the plaintext against a
+     * list of potential plaintexts) in a scenario where the
+     * attacker does not have access to the signature value or to
+     * the public key, but knows the nonce (without knowledge of the
+     * nonce, the hashed output cannot be matched against potential
+     * plaintexts).
+     */
+    size_t n;

+    n = (size_t)1 << logn;
+    while (n > 0) {
+        uint8_t buf[2];
+        uint32_t w;
+
+        inner_shake256_extract(sc, (void *)buf, sizeof buf);
+        w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+        if (w < 61445) {
+            while (w >= 12289) {
+                w -= 12289;
+            }
+            *x ++ = (uint16_t)w;
+            n --;
+        }
+    }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn, uint8_t *tmp) {
    /*
     * Each 16-bit sample is a value in 0..65535. The value is
     * kept if it falls in 0..61444 (because 61445 = 5*12289)
@ -97,7 +130,7 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
        uint8_t buf[2];
        uint32_t w, wr;

-        shake256_extract(sc, buf, sizeof buf);
+        inner_shake256_extract(sc, buf, sizeof buf);
        w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
        wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
        wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@ -196,7 +229,6 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
            *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
        }
    }
-
 }

 /* see inner.h */
--- a/crypto_sign/falcon-1024/clean/fpr.c
+++ b/crypto_sign/falcon-1024/clean/fpr.c
@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {


 uint64_t
-fpr_expm_p63(fpr x) {
+fpr_expm_p63(fpr x, fpr ccs) {
    /*
     * Polynomial approximation of exp(-x) is taken from FACCT:
     *   https://eprint.iacr.org/2018/1234
@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {

    uint64_t z, y;
    unsigned u;
+    uint32_t z0, z1, y0, y1;
+    uint64_t a, b;

    y = C[0];
    z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
         * also have appropriate IEEE754 floating-point support,
         * which is better.
         */
-        uint32_t z0, z1, y0, y1;
-        uint64_t a, b, c;
+        uint64_t c;

        z0 = (uint32_t)z;
        z1 = (uint32_t)(z >> 32);
@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
        c += (uint64_t)z1 * (uint64_t)y1;
        y = C[u] - c;
    }
+
+    /*
+     * The scaling factor must be applied at the end. Since y is now
+     * in fixed-point notation, we have to convert the factor to the
+     * same format, and do an extra integer multiplication.
+     */
+    z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+    z0 = (uint32_t)z;
+    z1 = (uint32_t)(z >> 32);
+    y0 = (uint32_t)y;
+    y1 = (uint32_t)(y >> 32);
+    a = ((uint64_t)z0 * (uint64_t)y1)
+        + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+    b = ((uint64_t)z1 * (uint64_t)y0);
+    y = (a >> 32) + (b >> 32);
+    y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+    y += (uint64_t)z1 * (uint64_t)y1;
+
    return y;
 }

--- a/crypto_sign/falcon-1024/clean/fpr.h
+++ b/crypto_sign/falcon-1024/clean/fpr.h
@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
 static const fpr fpr_one = 4607182418800017408;
 static const fpr fpr_two = 4611686018427387904;
 static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
 static const fpr fpr_ptwo31 = 4746794007248502784;
 static const fpr fpr_ptwo31m1 = 4746794007244308480;
 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
 * bits or so.
 */
 #define fpr_expm_p63   PQCLEAN_FALCON1024_CLEAN_fpr_expm_p63
-uint64_t fpr_expm_p63(fpr x);
+uint64_t fpr_expm_p63(fpr x, fpr ccs);

 #define fpr_gm_tab   PQCLEAN_FALCON1024_CLEAN_fpr_gm_tab
 extern const fpr fpr_gm_tab[];
--- a/crypto_sign/falcon-1024/clean/inner.h
+++ b/crypto_sign/falcon-1024/clean/inner.h
@ -34,6 +34,45 @@
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ *  - All public functions (i.e. the non-static ones) must be referenced
+ *    with the PQCLEAN_FALCON1024_CLEAN_ macro (e.g. PQCLEAN_FALCON1024_CLEAN_verify_raw for the verify_raw()
+ *    function). That macro adds a prefix to the name, which is
+ *    configurable with the FALCON_PREFIX macro. This allows compiling
+ *    the code into a specific "namespace" and potentially including
+ *    several versions of this code into a single application (e.g. to
+ *    have AVX2 and non-AVX2 variants and select the one to use at
+ *    runtime based on availability of AVX2 opcodes).
+ *
+ *  - Functions that need temporary buffers expect them as a final
+ *    tmp[] array of type uint8_t*, with a size which is documented for
+ *    each function. However, most have some alignment requirements,
+ *    because they will use the array to store 16-bit, 32-bit or 64-bit
+ *    values (e.g. uint64_t or double). The caller must ensure proper
+ *    alignment. What happens on unaligned access depends on the
+ *    underlying architecture, ranging from a slight time penalty
+ *    to immediate termination of the process.
+ *
+ *  - Some functions rely on specific rounding rules and precision for
+ *    floating-point numbers. On some systems (in particular 32-bit x86
+ *    with the 387 FPU), this requires setting a hardware control
+ *    word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ *      oldcw = set_fpu_cw(2);
+ *      PQCLEAN_FALCON1024_CLEAN_sign_dyn(...);
+ *      set_fpu_cw(oldcw);
+ *
+ *    On systems where the native floating-point precision is already
+ *    proper, or integer-based emulation is used, the set_fpu_cw()
+ *    function does nothing, so it can be called systematically.
+ */
+

 #include <stdint.h>
 #include <stdlib.h>
@ -42,22 +81,47 @@



+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+    return x;
+}
+
+
+
+
 /* ==================================================================== */
 /*
 * SHAKE256 implementation (shake.c).
 *
 * API is defined to be easily replaced with the fips202.h API defined
- * as part of PQ Clean.
+ * as part of PQClean.
 */


 #include "fips202.h"

-#define shake256_context                 shake256incctx
-#define shake256_init(sc)                shake256_inc_init(sc)
-#define shake256_inject(sc, in, len)     shake256_inc_absorb(sc, in, len)
-#define shake256_flip(sc)                shake256_inc_finalize(sc)
-#define shake256_extract(sc, out, len)   shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_context                shake256incctx
+#define inner_shake256_init(sc)               shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len)    shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc)               shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len)  shake256_inc_squeeze(out, len, sc)


 /* ==================================================================== */
@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON1024_CLEAN_max_sig_bits[];

 /*
 * From a SHAKE256 context (must be already flipped), produce a new
- * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
 */
-void PQCLEAN_FALCON1024_CLEAN_hash_to_point(shake256_context *sc,
+void PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+        uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
        uint16_t *x, unsigned logn, uint8_t *tmp);

 /*
@ -184,6 +261,8 @@ void PQCLEAN_FALCON1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
 *   logn      is the degree log
 *   tmp[]     temporary, must have at least 2*2^logn bytes
 * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
                                        const uint16_t *h, unsigned logn, uint8_t *tmp);
@ -195,6 +274,7 @@ int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
 * reported if f is not invertible mod phi mod q).
 *
 * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
        const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@ -208,11 +288,53 @@ int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
 * The tmp[] array must have room for at least 4*2^logn bytes.
 *
 * Returned value is 1 in success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
        const int8_t *f, const int8_t *g, const int8_t *F,
        unsigned logn, uint8_t *tmp);

+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ *   h[]       receives the public key (NOT in NTT/Montgomery format)
+ *   c0[]      contains the hashed nonce+message
+ *   s1[]      is the first signature half
+ *   s2[]      is the second signature half
+ *   logn      is the degree log
+ *   tmp[]     temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regard to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
+        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+        unsigned logn, uint8_t *tmp);
+
 /* ==================================================================== */
 /*
 * Implementation of floating-point real numbers (fpr.h, fpr.c).
@ -358,7 +480,7 @@ typedef struct {
 * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
 * context (in "flipped" state) to obtain its initial state.
 */
-void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src);
+void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src);

 /*
 * Refill the PRNG buffer. This is normally invoked automatically, and
@ -586,6 +708,9 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,

 /*
 * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
 */
 #define FALCON_KEYGEN_TEMP_1      136
 #define FALCON_KEYGEN_TEMP_2      272
@ -608,8 +733,11 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,
 * public key is written in h. Either or both of G and h may be NULL,
 * in which case the corresponding element is not returned (they can
 * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
                                     int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                     unsigned logn, uint8_t *tmp);

@ -624,6 +752,9 @@ void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
 * a total of (8*logn+40)*2^logn bytes.
 *
 * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
 void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
        const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@ -636,9 +767,15 @@ void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
 *
 * The sig[] and hm[] buffers may overlap.
 *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
 * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                        const fpr *expanded_key,
                                        const uint16_t *hm, unsigned logn, uint8_t *tmp);

@ -651,13 +788,47 @@ void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
 *
 * The sig[] and hm[] buffers may overlap.
 *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
 * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                       const int8_t *f, const int8_t *g,
                                       const int8_t *F, const int8_t *G,
                                       const uint16_t *hm, unsigned logn, uint8_t *tmp);

+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ *   ctx      pointer to the sampler_context structure
+ *   mu       center for the distribution
+ *   isigma   inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+    prng p;
+    fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p);
+
 /* ==================================================================== */

 #endif
--- a/crypto_sign/falcon-1024/clean/keygen.c
+++ b/crypto_sign/falcon-1024/clean/keygen.c
@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,

 /* ==================================================================== */

+
+#define RNG_CONTEXT   inner_shake256_context
+
 /*
 * Get a random 8-byte integer from a SHAKE-based RNG. This function
 * ensures consistent interpretation of the SHAKE output so that
@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
 * a known seed is used.
 */
 static inline uint64_t
-get_rng_u64(shake256_context *rng) {
+get_rng_u64(inner_shake256_context *rng) {
    /*
     * We enforce little-endian representation.
     */

    uint8_t tmp[8];

-    shake256_extract(rng, tmp, sizeof tmp);
+    inner_shake256_extract(rng, tmp, sizeof tmp);
    return (uint64_t)tmp[0]
           | ((uint64_t)tmp[1] << 8)
           | ((uint64_t)tmp[2] << 16)
@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
           | ((uint64_t)tmp[7] << 56);
 }

+
 /*
 * Table below incarnates a discrete Gaussian distribution:
 *    D(x) = exp(-(x^2)/(2*sigma^2))
@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
 * together for lower dimensions.
 */
 static int
-mkgauss(shake256_context *rng, unsigned logn) {
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
    unsigned u, g;
    int val;

@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
            fpr xv;

            xv = fpr_mul(rt2[u], pdc);
+
            /*
             * Sometimes the values can be out-of-bounds if
             * the algorithm fails; we must not call
@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
 * also makes sure that the resultant of the polynomial with phi is odd.
 */
 static void
-poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
    size_t n, u;
    unsigned mod2;

@ -4046,7 +4051,7 @@ restart:

 /* see falcon.h */
 void
-PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
                                int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                unsigned logn, uint8_t *tmp) {
    /*
@ -4070,8 +4075,10 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
     */
    size_t n, u;
    uint16_t *h2, *tmp2;
+    RNG_CONTEXT *rc;

    n = MKN(logn);
+    rc = rng;

    /*
     * We need to generate f and g randomly, until we find values
@ -4104,8 +4111,8 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
         * (i.e. the resultant of the polynomial with phi
         * will be odd).
         */
-        poly_small_mkgauss(rng, f, logn);
-        poly_small_mkgauss(rng, g, logn);
+        poly_small_mkgauss(rc, f, logn);
+        poly_small_mkgauss(rc, g, logn);

        /*
         * Verify that all coefficients are within the bounds
--- a/crypto_sign/falcon-1024/clean/pqclean.c
+++ b/crypto_sign/falcon-1024/clean/pqclean.c
@ -51,16 +51,16 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(
    int8_t f[1024], g[1024], F[1024];
    uint16_t h[1024];
    unsigned char seed[48];
-    shake256_context rng;
+    inner_shake256_context rng;
    size_t u, v;

    /*
     * Generate key pair.
     */
    randombytes(seed, sizeof seed);
-    shake256_init(&rng);
-    shake256_inject(&rng, seed, sizeof seed);
-    shake256_flip(&rng);
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, sizeof seed);
+    inner_shake256_flip(&rng);
    PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);

    /*
@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
        uint16_t hm[1024];
    } r;
    unsigned char seed[48];
-    shake256_context sc;
+    inner_shake256_context sc;
    size_t u, v;

    /*
@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
    /*
     * Hash message nonce + message into a vector.
     */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, r.hm, 10, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b);

    /*
     * Initialize a RNG.
     */
    randombytes(seed, sizeof seed);
-    shake256_init(&sc);
-    shake256_inject(&sc, seed, sizeof seed);
-    shake256_flip(&sc);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);

    /*
     * Compute and return the signature. This loops until a signature
@ -225,7 +225,7 @@ do_verify(
    } tmp;
    uint16_t h[1024], hm[1024];
    int16_t sig[1024];
-    shake256_context sc;
+    inner_shake256_context sc;

    /*
     * Decode public key.
@ -253,11 +253,11 @@ do_verify(
    /*
     * Hash nonce + message into a vector.
     */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, hm, 10, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b);

    /*
     * Verify signature.
--- a/crypto_sign/falcon-1024/clean/rng.c
+++ b/crypto_sign/falcon-1024/clean/rng.c
@ -36,7 +36,7 @@

 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
+PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
    /*
     * To ensure reproducibility for a given seed, we
     * must enforce little-endian interpretation of
@ -46,7 +46,7 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
    uint64_t th, tl;
    int i;

-    shake256_extract(src, tmp, 56);
+    inner_shake256_extract(src, tmp, 56);
    for (i = 0; i < 14; i ++) {
        uint32_t w;

--- a/crypto_sign/falcon-1024/clean/sign.c
+++ b/crypto_sign/falcon-1024/clean/sign.c
@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
    size_t n, hn;
    const fpr *tree0, *tree1;

-    n = (size_t)1 << logn;
-    if (n == 1) {
+    /*
+     * When logn == 2, we inline the last two recursion levels.
+     */
+    if (logn == 2) {
+        fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        tree0 = tree + 4;
+        tree1 = tree + 8;
+
+        /*
+         * We split t1 into w*, then do the recursive invocation,
+         * with output in w*. We finally merge back into z1.
+         */
+        a_re = t1[0];
+        a_im = t1[2];
+        b_re = t1[1];
+        b_im = t1[3];
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree1[3];
+        w2 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, w2);
+        a_im = fpr_sub(x1, w3);
+        b_re = tree1[0];
+        b_im = tree1[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree1[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z1[0] = w0 = fpr_add(a_re, c_re);
+        z1[2] = w2 = fpr_add(a_im, c_im);
+        z1[1] = w1 = fpr_sub(a_re, c_re);
+        z1[3] = w3 = fpr_sub(a_im, c_im);
+
+        /*
+         * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+         */
+        w0 = fpr_sub(t1[0], w0);
+        w1 = fpr_sub(t1[1], w1);
+        w2 = fpr_sub(t1[2], w2);
+        w3 = fpr_sub(t1[3], w3);
+
+        a_re = w0;
+        a_im = w2;
+        b_re = tree[0];
+        b_im = tree[2];
+        w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        a_re = w1;
+        a_im = w3;
+        b_re = tree[1];
+        b_im = tree[3];
+        w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+        w0 = fpr_add(w0, t0[0]);
+        w1 = fpr_add(w1, t0[1]);
+        w2 = fpr_add(w2, t0[2]);
+        w3 = fpr_add(w3, t0[3]);
+
+        /*
+         * Second recursive invocation.
+         */
+        a_re = w0;
+        a_im = w2;
+        b_re = w1;
+        b_im = w3;
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree0[3];
+        w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree0[0];
+        b_im = tree0[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree0[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z0[0] = fpr_add(a_re, c_re);
+        z0[2] = fpr_add(a_im, c_im);
+        z0[1] = fpr_sub(a_re, c_re);
+        z0[3] = fpr_sub(a_im, c_im);
+
+        return;
+    }
+
+    /*
+     * Case logn == 1 is reachable only when using Falcon-2 (the
+     * smallest size for which Falcon is mathematically defined, but
+     * of course way too insecure to be of any use).
+     */
+    if (logn == 1) {
+        fpr x0, x1, y0, y1, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        x0 = t1[0];
+        x1 = t1[1];
+        sigma = tree[3];
+        z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree[0];
+        b_im = tree[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, t0[0]);
+        x1 = fpr_add(c_im, t0[1]);
+        sigma = tree[2];
+        z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+        z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+        return;
+    }
+
+    /*
+     * Normal end of recursion is for logn == 0. Since the last
+     * steps of the recursions were inlined in the blocks above
+     * (when logn == 1 or 2), this case is not reachable, and is
+     * retained here only for documentation purposes.
+
+    if (logn == 0) {
        fpr x0, x1, sigma;

        x0 = t0[0];
@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
        return;
    }

+     */
+
+    /*
+     * General recursive case (logn >= 3).
+     */
+
+    n = (size_t)1 << logn;
    hn = n >> 1;
    tree0 = tree + n;
    tree1 = tree + n + ffLDL_treesize(logn - 1);
@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    const fpr *b00, *b01, *b10, *b11, *tree;
    fpr ni;
    uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;

    n = MKN(logn);
    t0 = tmp;
@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    /*
     * Compute the signature.
     */
+    s1tmp = (int16_t *)tx;
    sqn = 0;
    ng = 0;
    for (u = 0; u < n; u ++) {
@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
        z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
        sqn += (uint32_t)(z * z);
        ng |= sqn;
+        s1tmp[u] = (int16_t)z;
    }
    sqn |= -(ng >> 31);

@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    }
    if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
        memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
        return 1;
    }
    return 0;
@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
    fpr ni;
    uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;

    n = MKN(logn);

@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    PQCLEAN_FALCON1024_CLEAN_iFFT(t0, logn);
    PQCLEAN_FALCON1024_CLEAN_iFFT(t1, logn);

+    s1tmp = (int16_t *)tx;
    sqn = 0;
    ng = 0;
    for (u = 0; u < n; u ++) {
@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
        z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
        sqn += (uint32_t)(z * z);
        ng |= sqn;
+        s1tmp[u] = (int16_t)z;
    }
    sqn |= -(ng >> 31);

@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    }
    if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
        memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
        return 1;
    }
    return 0;
@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
 * Sample an integer value along a half-gaussian distribution centered
 * on zero and standard deviation 1.8205, with a precision of 72 bits.
 */
-static int
-gaussian0_sampler(prng *p) {
+int
+PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p) {

    static const uint32_t dist[] = {
-        6031371U, 13708371U, 13035518U,
-        5186761U,  1487980U, 12270720U,
-        3298653U,  4688887U,  5511555U,
-        1551448U,  9247616U,  9467675U,
-        539632U, 14076116U,  5909365U,
-        138809U, 10836485U, 13263376U,
-        26405U, 15335617U, 16601723U,
-        3714U, 14514117U, 13240074U,
-        386U,  8324059U,  3276722U,
-        29U, 12376792U,  7821247U,
-        1U, 11611789U,  3398254U,
-        0U,  1194629U,  4532444U,
-        0U,    37177U,  2973575U,
-        0U,      855U, 10369757U,
-        0U,       14U,  9441597U,
-        0U,        0U,  3075302U,
-        0U,        0U,    28626U,
-        0U,        0U,      197U,
-        0U,        0U,        1U
+        10745844u,  3068844u,  3741698u,
+        5559083u,  1580863u,  8248194u,
+        2260429u, 13669192u,  2736639u,
+        708981u,  4421575u, 10046180u,
+        169348u,  7122675u,  4136815u,
+        30538u, 13063405u,  7650655u,
+        4132u, 14505003u,  7826148u,
+        417u, 16768101u, 11363290u,
+        31u,  8444042u,  8086568u,
+        1u, 12844466u,   265321u,
+        0u,  1232676u, 13644283u,
+        0u,    38047u,  9111839u,
+        0u,      870u,  6138264u,
+        0u,       14u, 12545723u,
+        0u,        0u,  3104126u,
+        0u,        0u,    28824u,
+        0u,        0u,      198u,
+        0u,        0u,        1u
    };

    uint32_t v0, v1, v2, hi;
@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
 * Sample a bit with probability exp(-x) for some x >= 0.
 */
 static int
-BerExp(prng *p, fpr x) {
+BerExp(prng *p, fpr x, fpr ccs) {
    int s, i;
    fpr r;
    uint32_t sw, w;
@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
     * case). The bias is negligible since fpr_expm_p63() only computes
     * with 51 bits of precision or so.
     */
-    z = ((fpr_expm_p63(r) << 1) - 1) >> s;
+    z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;

    /*
     * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
    return (int)(w >> 31);
 }

-typedef struct {
-    prng p;
-    fpr sigma_min;
-} sampler_context;
-
 /*
 * The sampler produces a random integer that follows a discrete Gaussian
 * distribution, centered on mu, and with standard deviation sigma. The
@ -909,8 +1078,8 @@ typedef struct {
 * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
 * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
 */
-static int
-sampler(void *ctx, fpr mu, fpr isigma) {
+int
+PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
    sampler_context *spc;
    int s;
    fpr r, dss, ccs;
@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
         *  - b = 0: z <= 0 and sampled against a Gaussian
         *    centered on 0.
         */
-        z0 = gaussian0_sampler(&spc->p);
+        z0 = PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(&spc->p);
        b = prng_get_u8(&spc->p) & 1;
        z = b + ((b << 1) - 1) * z0;

@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
         */
        x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
        x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
-        x = fpr_mul(x, ccs);
-        if (BerExp(&spc->p, x)) {
+        if (BerExp(&spc->p, x, ccs)) {
            /*
             * Rejection sampling was centered on r, but the
             * actual center is mu = s + r.
@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {

 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                   const fpr *expanded_key,
                                   const uint16_t *hm, unsigned logn, uint8_t *tmp) {
    fpr *ftmp;
@ -1025,7 +1193,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
                        ? fpr_sigma_min_10
                        : fpr_sigma_min_9;
        PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON1024_CLEAN_sampler;
        samp_ctx = &spc;

        /*
@ -1040,7 +1208,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,

 /* see inner.h */
 void
-PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                  const int8_t *f, const int8_t *g,
                                  const int8_t *F, const int8_t *G,
                                  const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@ -1070,7 +1238,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
                        ? fpr_sigma_min_10
                        : fpr_sigma_min_9;
        PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON1024_CLEAN_sampler;
        samp_ctx = &spc;

        /*
--- a/crypto_sign/falcon-1024/clean/vrfy.c
+++ b/crypto_sign/falcon-1024/clean/vrfy.c
@ -649,7 +649,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    }

    /*
-     * Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
+     * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
     */
    mq_NTT(tt, logn);
    mq_poly_montymul_ntt(tt, h, logn);
@ -657,7 +657,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    mq_poly_sub(tt, c0, logn);

    /*
-     * Normalize s1 elements into the [-q/2..q/2] range.
+     * Normalize -s1 elements into the [-q/2..q/2] range.
     */
    for (u = 0; u < n; u ++) {
        int32_t w;
@ -668,7 +668,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    }

    /*
-     * Signature is valid if and only if the aggregate (s1,s2) vector
+     * Signature is valid if and only if the aggregate (-s1,s2) vector
     * is short enough.
     */
    return PQCLEAN_FALCON1024_CLEAN_is_short((int16_t *)tt, s2, logn);
@ -699,7 +699,7 @@ PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
    return 1;
 }

-/* see internal.h */
+/* see inner.h */
 int
 PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
        const int8_t *f, const int8_t *g, const int8_t *F,
@ -743,3 +743,110 @@ PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
    }
    return 1;
 }
+
+/* see inner.h */
+int
+PQCLEAN_FALCON1024_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+    tt = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31);
+        tt[u] = (uint16_t)w;
+    }
+    mq_NTT(tt, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1);
+    }
+    return (int)(1u - (r >> 31));
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
+                                        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+                                        unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+
+    /*
+     * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+     * and c0 - s1 into h[].
+     */
+    tt = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31);
+        tt[u] = (uint16_t)w;
+
+        w = (uint32_t)s1[u];
+        w += Q & -(w >> 31);
+        w = mq_sub(c0[u], w);
+        h[u] = (uint16_t)w;
+    }
+
+    /*
+     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+     * is zero (in NTT representation) then the operation fails. We
+     * keep that information into a flag so that we do not deviate
+     * from strict constant-time processing; if all coefficients of
+     * s2 are non-zero, then the high bit of r will be zero.
+     */
+    mq_NTT(tt, logn);
+    mq_NTT(h, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1);
+        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
+    }
+    mq_iNTT(h, logn);
+
+    /*
+     * Signature is acceptable if and only if it is short enough,
+     * and s2 was invertible mod phi mod q. The caller must still
+     * check that the rebuilt public key matches the expected
+     * value (e.g. through a hash).
+     */
+    r = ~r & (uint32_t) - PQCLEAN_FALCON1024_CLEAN_is_short(s1, s2, logn);
+    return (int)(r >> 31);
+}
+
+/* see inner.h */
+int
+PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+    uint16_t *s2;
+    size_t u, n;
+    uint32_t r;
+
+    n = (size_t)1 << logn;
+    s2 = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)sig[u];
+        w += Q & -(w >> 31);
+        s2[u] = (uint16_t)w;
+    }
+    mq_NTT(s2, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u] - 1u;
+        r += (w >> 31);
+    }
+    return (int)r;
+}
--- a/crypto_sign/falcon-512/META.yml
+++ b/crypto_sign/falcon-512/META.yml
@ -4,8 +4,8 @@ claimed-nist-level: 1
 length-public-key: 897
 length-secret-key: 1281
 length-signature: 690
-nistkat-sha256: abc62e7be3d7c1db757ba3cbb771cfdc89c6b36fb5efc885593db89ec2ea8bc4
-testvectors-sha256: 1a1b170fc9e4623e7ff519c15ec7a2dda55e94a175756b7c72429451bd226b09
+nistkat-sha256: e9c3985f1ce732e29ca81aeca091f20d4dbb5beb456ee1a7ab41d04add4dab10
+testvectors-sha256: 036b5e803ab825146502513b7460b24cc9493f8e366323cd5e30e2dc6d4ca6a7
 principal-submitters:
  - Thomas Prest
 auxiliary-submitters:
--- a/crypto_sign/falcon-512/clean/common.c
+++ b/crypto_sign/falcon-512/clean/common.c
@ -33,10 +33,43 @@

 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_hash_to_point(
-    shake256_context *sc,
-    uint16_t *x, unsigned logn, uint8_t *tmp) {
+PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn) {
+    /*
+     * This is the straightforward per-the-spec implementation. It
+     * is not constant-time, thus it might reveal information on the
+     * plaintext (at least, enough to check the plaintext against a
+     * list of potential plaintexts) in a scenario where the
+     * attacker does not have access to the signature value or to
+     * the public key, but knows the nonce (without knowledge of the
+     * nonce, the hashed output cannot be matched against potential
+     * plaintexts).
+     */
+    size_t n;

+    n = (size_t)1 << logn;
+    while (n > 0) {
+        uint8_t buf[2];
+        uint32_t w;
+
+        inner_shake256_extract(sc, (void *)buf, sizeof buf);
+        w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
+        if (w < 61445) {
+            while (w >= 12289) {
+                w -= 12289;
+            }
+            *x ++ = (uint16_t)w;
+            n --;
+        }
+    }
+}
+
+/* see inner.h */
+void
+PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(
+    inner_shake256_context *sc,
+    uint16_t *x, unsigned logn, uint8_t *tmp) {
    /*
     * Each 16-bit sample is a value in 0..65535. The value is
     * kept if it falls in 0..61444 (because 61445 = 5*12289)
@ -97,7 +130,7 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
        uint8_t buf[2];
        uint32_t w, wr;

-        shake256_extract(sc, buf, sizeof buf);
+        inner_shake256_extract(sc, buf, sizeof buf);
        w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
        wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
        wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@ -196,7 +229,6 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
            *d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
        }
    }
-
 }

 /* see inner.h */
--- a/crypto_sign/falcon-512/clean/fpr.c
+++ b/crypto_sign/falcon-512/clean/fpr.c
@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {


 uint64_t
-fpr_expm_p63(fpr x) {
+fpr_expm_p63(fpr x, fpr ccs) {
    /*
     * Polynomial approximation of exp(-x) is taken from FACCT:
     *   https://eprint.iacr.org/2018/1234
@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {

    uint64_t z, y;
    unsigned u;
+    uint32_t z0, z1, y0, y1;
+    uint64_t a, b;

    y = C[0];
    z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
         * also have appropriate IEEE754 floating-point support,
         * which is better.
         */
-        uint32_t z0, z1, y0, y1;
-        uint64_t a, b, c;
+        uint64_t c;

        z0 = (uint32_t)z;
        z1 = (uint32_t)(z >> 32);
@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
        c += (uint64_t)z1 * (uint64_t)y1;
        y = C[u] - c;
    }
+
+    /*
+     * The scaling factor must be applied at the end. Since y is now
+     * in fixed-point notation, we have to convert the factor to the
+     * same format, and do an extra integer multiplication.
+     */
+    z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
+    z0 = (uint32_t)z;
+    z1 = (uint32_t)(z >> 32);
+    y0 = (uint32_t)y;
+    y1 = (uint32_t)(y >> 32);
+    a = ((uint64_t)z0 * (uint64_t)y1)
+        + (((uint64_t)z0 * (uint64_t)y0) >> 32);
+    b = ((uint64_t)z1 * (uint64_t)y0);
+    y = (a >> 32) + (b >> 32);
+    y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
+    y += (uint64_t)z1 * (uint64_t)y1;
+
    return y;
 }

--- a/crypto_sign/falcon-512/clean/fpr.h
+++ b/crypto_sign/falcon-512/clean/fpr.h
@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
 static const fpr fpr_one = 4607182418800017408;
 static const fpr fpr_two = 4611686018427387904;
 static const fpr fpr_onehalf = 4602678819172646912;
+static const fpr fpr_invsqrt2 = 4604544271217802189;
+static const fpr fpr_invsqrt8 = 4600040671590431693;
 static const fpr fpr_ptwo31 = 4746794007248502784;
 static const fpr fpr_ptwo31m1 = 4746794007244308480;
 static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
 * bits or so.
 */
 #define fpr_expm_p63   PQCLEAN_FALCON512_CLEAN_fpr_expm_p63
-uint64_t fpr_expm_p63(fpr x);
+uint64_t fpr_expm_p63(fpr x, fpr ccs);

 #define fpr_gm_tab   PQCLEAN_FALCON512_CLEAN_fpr_gm_tab
 extern const fpr fpr_gm_tab[];
--- a/crypto_sign/falcon-512/clean/inner.h
+++ b/crypto_sign/falcon-512/clean/inner.h
@ -34,6 +34,45 @@
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */

+/*
+ * IMPORTANT API RULES
+ * -------------------
+ *
+ * This API has some non-trivial usage rules:
+ *
+ *
+ *  - All public functions (i.e. the non-static ones) must be referenced
+ *    with the PQCLEAN_FALCON512_CLEAN_ macro (e.g. PQCLEAN_FALCON512_CLEAN_verify_raw for the verify_raw()
+ *    function). That macro adds a prefix to the name, which is
+ *    configurable with the FALCON_PREFIX macro. This allows compiling
+ *    the code into a specific "namespace" and potentially including
+ *    several versions of this code into a single application (e.g. to
+ *    have an AVX2 and a non-AVX2 variants and select the one to use at
+ *    runtime based on availability of AVX2 opcodes).
+ *
+ *  - Functions that need temporary buffers expect them as a final
+ *    tmp[] array of type uint8_t*, with a size which is documented for
+ *    each function. However, most have some alignment requirements,
+ *    because they will use the array to store 16-bit, 32-bit or 64-bit
+ *    values (e.g. uint64_t or double). The caller must ensure proper
+ *    alignment. What happens on unaligned access depends on the
+ *    underlying architecture, ranging from a slight time penalty
+ *    to immediate termination of the process.
+ *
+ *  - Some functions rely on specific rounding rules and precision for
+ *    floating-point numbers. On some systems (in particular 32-bit x86
+ *    with the 387 FPU), this requires setting a hardware control
+ *    word. The caller MUST use set_fpu_cw() to ensure proper precision:
+ *
+ *      oldcw = set_fpu_cw(2);
+ *      PQCLEAN_FALCON512_CLEAN_sign_dyn(...);
+ *      set_fpu_cw(oldcw);
+ *
+ *    On systems where the native floating-point precision is already
+ *    proper, or integer-based emulation is used, the set_fpu_cw()
+ *    function does nothing, so it can be called systematically.
+ */
+

 #include <stdint.h>
 #include <stdlib.h>
@ -42,22 +81,47 @@



+
+/*
+ * Some computations with floating-point elements, in particular
+ * rounding to the nearest integer, rely on operations using _exactly_
+ * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
+ * x86, the 387 FPU may be used (depending on the target OS) and, in
+ * that case, may use more precision bits (i.e. 64 bits, for an 80-bit
+ * total type length); to prevent miscomputations, we define an explicit
+ * function that modifies the precision in the FPU control word.
+ *
+ * set_fpu_cw() sets the precision to the provided value, and returns
+ * the previously set precision; callers are supposed to restore the
+ * previous precision on exit. The correct (52-bit) precision is
+ * configured with the value "2". On unsupported compilers, or on
+ * targets other than 32-bit x86, or when the native 'double' type is
+ * not used, the set_fpu_cw() function does nothing at all.
+ */
+static inline unsigned
+set_fpu_cw(unsigned x) {
+    return x;
+}
+
+
+
+
 /* ==================================================================== */
 /*
 * SHAKE256 implementation (shake.c).
 *
 * API is defined to be easily replaced with the fips202.h API defined
- * as part of PQ Clean.
+ * as part of PQClean.
 */


 #include "fips202.h"

-#define shake256_context                 shake256incctx
-#define shake256_init(sc)                shake256_inc_init(sc)
-#define shake256_inject(sc, in, len)     shake256_inc_absorb(sc, in, len)
-#define shake256_flip(sc)                shake256_inc_finalize(sc)
-#define shake256_extract(sc, out, len)   shake256_inc_squeeze(out, len, sc)
+#define inner_shake256_context                shake256incctx
+#define inner_shake256_init(sc)               shake256_inc_init(sc)
+#define inner_shake256_inject(sc, in, len)    shake256_inc_absorb(sc, in, len)
+#define inner_shake256_flip(sc)               shake256_inc_finalize(sc)
+#define inner_shake256_extract(sc, out, len)  shake256_inc_squeeze(out, len, sc)


 /* ==================================================================== */
@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON512_CLEAN_max_sig_bits[];

 /*
 * From a SHAKE256 context (must be already flipped), produce a new
- * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * point. This is the non-constant-time version, which may leak enough
+ * information to serve as a stop condition on a brute force attack on
+ * the hashed message (provided that the nonce value is known).
 */
-void PQCLEAN_FALCON512_CLEAN_hash_to_point(shake256_context *sc,
+void PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
+        uint16_t *x, unsigned logn);
+
+/*
+ * From a SHAKE256 context (must be already flipped), produce a new
+ * point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
+ * This function is constant-time but is typically more expensive than
+ * PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime().
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+void PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
        uint16_t *x, unsigned logn, uint8_t *tmp);

 /*
@ -184,6 +261,8 @@ void PQCLEAN_FALCON512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
 *   logn      is the degree log
 *   tmp[]     temporary, must have at least 2*2^logn bytes
 * Returned value is 1 on success, 0 on error.
+ *
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
                                       const uint16_t *h, unsigned logn, uint8_t *tmp);
@ -195,6 +274,7 @@ int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
 * reported if f is not invertible mod phi mod q).
 *
 * The tmp[] array must have room for at least 2*2^logn elements.
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
        const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@ -208,11 +288,53 @@ int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
 * The tmp[] array must have room for at least 4*2^logn bytes.
 *
 * Returned value is 1 in success, 0 on error (f not invertible).
+ * tmp[] must have 16-bit alignment.
 */
 int PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
        const int8_t *f, const int8_t *g, const int8_t *F,
        unsigned logn, uint8_t *tmp);

+/*
+ * Test whether a given polynomial is invertible modulo phi and q.
+ * Polynomial coefficients are small integers.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp);
+
+/*
+ * Count the number of elements of value zero in the NTT representation
+ * of the given polynomial: this is the number of primitive 2n-th roots
+ * of unity (modulo q = 12289) that are roots of the provided polynomial
+ * (taken modulo q).
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
+
+/*
+ * Internal signature verification with public key recovery:
+ *   h[]       receives the public key (NOT in NTT/Montgomery format)
+ *   c0[]      contains the hashed nonce+message
+ *   s1[]      is the first signature half
+ *   s2[]      is the second signature half
+ *   logn      is the degree log
+ *   tmp[]     temporary, must have at least 2*2^logn bytes
+ * Returned value is 1 on success, 0 on error. Success is returned if
+ * the signature is a short enough vector; in that case, the public
+ * key has been written to h[]. However, the caller must still
+ * verify that h[] is the correct value (e.g. with regard to a known
+ * hash of the public key).
+ *
+ * h[] may not overlap with any of the other arrays.
+ *
+ * tmp[] must have 16-bit alignment.
+ */
+int PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
+        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+        unsigned logn, uint8_t *tmp);
+
 /* ==================================================================== */
 /*
 * Implementation of floating-point real numbers (fpr.h, fpr.c).
@ -358,7 +480,7 @@ typedef struct {
 * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
 * context (in "flipped" state) to obtain its initial state.
 */
-void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src);
+void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src);

 /*
 * Refill the PRNG buffer. This is normally invoked automatically, and
@ -586,6 +708,9 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,

 /*
 * Required sizes of the temporary buffer (in bytes).
+ *
+ * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
+ * or 2) where it is slightly greater.
 */
 #define FALCON_KEYGEN_TEMP_1      136
 #define FALCON_KEYGEN_TEMP_2      272
@ -608,8 +733,11 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,
 * public key is written in h. Either or both of G and h may be NULL,
 * in which case the corresponding element is not returned (they can
 * be recomputed from f, g and F).
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
                                    int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                                    unsigned logn, uint8_t *tmp);

@ -624,6 +752,9 @@ void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
 * a total of (8*logn+40)*2^logn bytes.
 *
 * The tmp[] array must have room for at least 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
 void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
        const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@ -636,9 +767,15 @@ void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
 *
 * The sig[] and hm[] buffers may overlap.
 *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
 * The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                       const fpr *expanded_key,
                                       const uint16_t *hm, unsigned logn, uint8_t *tmp);

@ -651,13 +788,47 @@ void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
 *
 * The sig[] and hm[] buffers may overlap.
 *
+ * On successful output, the start of the tmp[] buffer contains the s1
+ * vector (as int16_t elements).
+ *
 * The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
+ *
+ * tmp[] must have 64-bit alignment.
+ * This function uses floating-point rounding (see set_fpu_cw()).
 */
-void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                      const int8_t *f, const int8_t *g,
                                      const int8_t *F, const int8_t *G,
                                      const uint16_t *hm, unsigned logn, uint8_t *tmp);

+/*
+ * Internal sampler engine. Exported for tests.
+ *
+ * sampler_context wraps around a source of random numbers (PRNG) and
+ * the sigma_min value (nominally dependent on the degree).
+ *
+ * sampler() takes as parameters:
+ *   ctx      pointer to the sampler_context structure
+ *   mu       center for the distribution
+ *   isigma   inverse of the distribution standard deviation
+ * It returns an integer sampled along the Gaussian distribution centered
+ * on mu and of standard deviation sigma = 1/isigma.
+ *
+ * gaussian0_sampler() takes as parameter a pointer to a PRNG, and
+ * returns an integer sampled along a half-Gaussian with standard
+ * deviation sigma0 = 1.8205 (center is 0, returned value is
+ * nonnegative).
+ */
+
+typedef struct {
+    prng p;
+    fpr sigma_min;
+} sampler_context;
+
+int PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
+
+int PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p);
+
 /* ==================================================================== */

 #endif
--- a/crypto_sign/falcon-512/clean/keygen.c
+++ b/crypto_sign/falcon-512/clean/keygen.c
@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,

 /* ==================================================================== */

+
+#define RNG_CONTEXT   inner_shake256_context
+
 /*
 * Get a random 8-byte integer from a SHAKE-based RNG. This function
 * ensures consistent interpretation of the SHAKE output so that
@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
 * a known seed is used.
 */
 static inline uint64_t
-get_rng_u64(shake256_context *rng) {
+get_rng_u64(inner_shake256_context *rng) {
    /*
     * We enforce little-endian representation.
     */

    uint8_t tmp[8];

-    shake256_extract(rng, tmp, sizeof tmp);
+    inner_shake256_extract(rng, tmp, sizeof tmp);
    return (uint64_t)tmp[0]
           | ((uint64_t)tmp[1] << 8)
           | ((uint64_t)tmp[2] << 16)
@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
           | ((uint64_t)tmp[7] << 56);
 }

+
 /*
 * Table below incarnates a discrete Gaussian distribution:
 *    D(x) = exp(-(x^2)/(2*sigma^2))
@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
 * together for lower dimensions.
 */
 static int
-mkgauss(shake256_context *rng, unsigned logn) {
+mkgauss(RNG_CONTEXT *rng, unsigned logn) {
    unsigned u, g;
    int val;

@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
            fpr xv;

            xv = fpr_mul(rt2[u], pdc);
+
            /*
             * Sometimes the values can be out-of-bounds if
             * the algorithm fails; we must not call
@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
 * also makes sure that the resultant of the polynomial with phi is odd.
 */
 static void
-poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
+poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
    size_t n, u;
    unsigned mod2;

@ -4046,7 +4051,7 @@ restart:

 /* see falcon.h */
 void
-PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
                               int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
                               unsigned logn, uint8_t *tmp) {
    /*
@ -4070,8 +4075,10 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
     */
    size_t n, u;
    uint16_t *h2, *tmp2;
+    RNG_CONTEXT *rc;

    n = MKN(logn);
+    rc = rng;

    /*
     * We need to generate f and g randomly, until we find values
@ -4104,8 +4111,8 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
         * (i.e. the resultant of the polynomial with phi
         * will be odd).
         */
-        poly_small_mkgauss(rng, f, logn);
-        poly_small_mkgauss(rng, g, logn);
+        poly_small_mkgauss(rc, f, logn);
+        poly_small_mkgauss(rc, g, logn);

        /*
         * Verify that all coefficients are within the bounds
--- a/crypto_sign/falcon-512/clean/pqclean.c
+++ b/crypto_sign/falcon-512/clean/pqclean.c
@ -51,16 +51,16 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(
    int8_t f[512], g[512], F[512];
    uint16_t h[512];
    unsigned char seed[48];
-    shake256_context rng;
+    inner_shake256_context rng;
    size_t u, v;

    /*
     * Generate key pair.
     */
    randombytes(seed, sizeof seed);
-    shake256_init(&rng);
-    shake256_inject(&rng, seed, sizeof seed);
-    shake256_flip(&rng);
+    inner_shake256_init(&rng);
+    inner_shake256_inject(&rng, seed, sizeof seed);
+    inner_shake256_flip(&rng);
    PQCLEAN_FALCON512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);

    /*
@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
        uint16_t hm[512];
    } r;
    unsigned char seed[48];
-    shake256_context sc;
+    inner_shake256_context sc;
    size_t u, v;

    /*
@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
    /*
     * Hash message nonce + message into a vector.
     */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, r.hm, 9, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b);

    /*
     * Initialize a RNG.
     */
    randombytes(seed, sizeof seed);
-    shake256_init(&sc);
-    shake256_inject(&sc, seed, sizeof seed);
-    shake256_flip(&sc);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, seed, sizeof seed);
+    inner_shake256_flip(&sc);

    /*
     * Compute and return the signature. This loops until a signature
@ -225,7 +225,7 @@ do_verify(
    } tmp;
    uint16_t h[512], hm[512];
    int16_t sig[512];
-    shake256_context sc;
+    inner_shake256_context sc;

    /*
     * Decode public key.
@ -253,11 +253,11 @@ do_verify(
    /*
     * Hash nonce + message into a vector.
     */
-    shake256_init(&sc);
-    shake256_inject(&sc, nonce, NONCELEN);
-    shake256_inject(&sc, m, mlen);
-    shake256_flip(&sc);
-    PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, hm, 9, tmp.b);
+    inner_shake256_init(&sc);
+    inner_shake256_inject(&sc, nonce, NONCELEN);
+    inner_shake256_inject(&sc, m, mlen);
+    inner_shake256_flip(&sc);
+    PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b);

    /*
     * Verify signature.
--- a/crypto_sign/falcon-512/clean/rng.c
+++ b/crypto_sign/falcon-512/clean/rng.c
@ -36,7 +36,7 @@

 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
+PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
    /*
     * To ensure reproducibility for a given seed, we
     * must enforce little-endian interpretation of
@ -46,7 +46,7 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
    uint64_t th, tl;
    int i;

-    shake256_extract(src, tmp, 56);
+    inner_shake256_extract(src, tmp, 56);
    for (i = 0; i < 14; i ++) {
        uint32_t w;

--- a/crypto_sign/falcon-512/clean/sign.c
+++ b/crypto_sign/falcon-512/clean/sign.c
@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
    size_t n, hn;
    const fpr *tree0, *tree1;

-    n = (size_t)1 << logn;
-    if (n == 1) {
+    /*
+     * When logn == 2, we inline the last two recursion levels.
+     */
+    if (logn == 2) {
+        fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        tree0 = tree + 4;
+        tree1 = tree + 8;
+
+        /*
+         * We split t1 into w*, then do the recursive invocation,
+         * with output in w*. We finally merge back into z1.
+         */
+        a_re = t1[0];
+        a_im = t1[2];
+        b_re = t1[1];
+        b_im = t1[3];
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree1[3];
+        w2 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, w2);
+        a_im = fpr_sub(x1, w3);
+        b_re = tree1[0];
+        b_im = tree1[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree1[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z1[0] = w0 = fpr_add(a_re, c_re);
+        z1[2] = w2 = fpr_add(a_im, c_im);
+        z1[1] = w1 = fpr_sub(a_re, c_re);
+        z1[3] = w3 = fpr_sub(a_im, c_im);
+
+        /*
+         * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
+         */
+        w0 = fpr_sub(t1[0], w0);
+        w1 = fpr_sub(t1[1], w1);
+        w2 = fpr_sub(t1[2], w2);
+        w3 = fpr_sub(t1[3], w3);
+
+        a_re = w0;
+        a_im = w2;
+        b_re = tree[0];
+        b_im = tree[2];
+        w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        a_re = w1;
+        a_im = w3;
+        b_re = tree[1];
+        b_im = tree[3];
+        w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+
+        w0 = fpr_add(w0, t0[0]);
+        w1 = fpr_add(w1, t0[1]);
+        w2 = fpr_add(w2, t0[2]);
+        w3 = fpr_add(w3, t0[3]);
+
+        /*
+         * Second recursive invocation.
+         */
+        a_re = w0;
+        a_im = w2;
+        b_re = w1;
+        b_im = w3;
+        c_re = fpr_add(a_re, b_re);
+        c_im = fpr_add(a_im, b_im);
+        w0 = fpr_half(c_re);
+        w1 = fpr_half(c_im);
+        c_re = fpr_sub(a_re, b_re);
+        c_im = fpr_sub(a_im, b_im);
+        w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
+        w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
+
+        x0 = w2;
+        x1 = w3;
+        sigma = tree0[3];
+        w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree0[0];
+        b_im = tree0[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, w0);
+        x1 = fpr_add(c_im, w1);
+        sigma = tree0[2];
+        w0 = fpr_of(samp(samp_ctx, x0, sigma));
+        w1 = fpr_of(samp(samp_ctx, x1, sigma));
+
+        a_re = w0;
+        a_im = w1;
+        b_re = w2;
+        b_im = w3;
+        c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
+        c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
+        z0[0] = fpr_add(a_re, c_re);
+        z0[2] = fpr_add(a_im, c_im);
+        z0[1] = fpr_sub(a_re, c_re);
+        z0[3] = fpr_sub(a_im, c_im);
+
+        return;
+    }
+
+    /*
+     * Case logn == 1 is reachable only when using Falcon-2 (the
+     * smallest size for which Falcon is mathematically defined, but
+     * of course way too insecure to be of any use).
+     */
+    if (logn == 1) {
+        fpr x0, x1, y0, y1, sigma;
+        fpr a_re, a_im, b_re, b_im, c_re, c_im;
+
+        x0 = t1[0];
+        x1 = t1[1];
+        sigma = tree[3];
+        z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
+        z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
+        a_re = fpr_sub(x0, y0);
+        a_im = fpr_sub(x1, y1);
+        b_re = tree[0];
+        b_im = tree[1];
+        c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
+        c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
+        x0 = fpr_add(c_re, t0[0]);
+        x1 = fpr_add(c_im, t0[1]);
+        sigma = tree[2];
+        z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
+        z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
+
+        return;
+    }
+
+    /*
+     * Normal end of recursion is for logn == 0. Since the last
+     * steps of the recursions were inlined in the blocks above
+     * (when logn == 1 or 2), this case is not reachable, and is
+     * retained here only for documentation purposes.
+
+    if (logn == 0) {
        fpr x0, x1, sigma;

        x0 = t0[0];
@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
        return;
    }

+     */
+
+    /*
+     * General recursive case (logn >= 3).
+     */
+
+    n = (size_t)1 << logn;
    hn = n >> 1;
    tree0 = tree + n;
    tree1 = tree + n + ffLDL_treesize(logn - 1);
@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    const fpr *b00, *b01, *b10, *b11, *tree;
    fpr ni;
    uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;

    n = MKN(logn);
    t0 = tmp;
@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    /*
     * Compute the signature.
     */
+    s1tmp = (int16_t *)tx;
    sqn = 0;
    ng = 0;
    for (u = 0; u < n; u ++) {
@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
        z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
        sqn += (uint32_t)(z * z);
        ng |= sqn;
+        s1tmp[u] = (int16_t)z;
    }
    sqn |= -(ng >> 31);

@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
    }
    if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
        memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
        return 1;
    }
    return 0;
@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
    fpr ni;
    uint32_t sqn, ng;
-    int16_t *s2tmp;
+    int16_t *s1tmp, *s2tmp;

    n = MKN(logn);

@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    PQCLEAN_FALCON512_CLEAN_iFFT(t0, logn);
    PQCLEAN_FALCON512_CLEAN_iFFT(t1, logn);

+    s1tmp = (int16_t *)tx;
    sqn = 0;
    ng = 0;
    for (u = 0; u < n; u ++) {
@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
        z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
        sqn += (uint32_t)(z * z);
        ng |= sqn;
+        s1tmp[u] = (int16_t)z;
    }
    sqn |= -(ng >> 31);

@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
    }
    if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
        memcpy(s2, s2tmp, n * sizeof * s2);
+        memcpy(tmp, s1tmp, n * sizeof * s1tmp);
        return 1;
    }
    return 0;
@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
 * Sample an integer value along a half-gaussian distribution centered
 * on zero and standard deviation 1.8205, with a precision of 72 bits.
 */
-static int
-gaussian0_sampler(prng *p) {
+int
+PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p) {

    static const uint32_t dist[] = {
-        6031371U, 13708371U, 13035518U,
-        5186761U,  1487980U, 12270720U,
-        3298653U,  4688887U,  5511555U,
-        1551448U,  9247616U,  9467675U,
-        539632U, 14076116U,  5909365U,
-        138809U, 10836485U, 13263376U,
-        26405U, 15335617U, 16601723U,
-        3714U, 14514117U, 13240074U,
-        386U,  8324059U,  3276722U,
-        29U, 12376792U,  7821247U,
-        1U, 11611789U,  3398254U,
-        0U,  1194629U,  4532444U,
-        0U,    37177U,  2973575U,
-        0U,      855U, 10369757U,
-        0U,       14U,  9441597U,
-        0U,        0U,  3075302U,
-        0U,        0U,    28626U,
-        0U,        0U,      197U,
-        0U,        0U,        1U
+        10745844u,  3068844u,  3741698u,
+        5559083u,  1580863u,  8248194u,
+        2260429u, 13669192u,  2736639u,
+        708981u,  4421575u, 10046180u,
+        169348u,  7122675u,  4136815u,
+        30538u, 13063405u,  7650655u,
+        4132u, 14505003u,  7826148u,
+        417u, 16768101u, 11363290u,
+        31u,  8444042u,  8086568u,
+        1u, 12844466u,   265321u,
+        0u,  1232676u, 13644283u,
+        0u,    38047u,  9111839u,
+        0u,      870u,  6138264u,
+        0u,       14u, 12545723u,
+        0u,        0u,  3104126u,
+        0u,        0u,    28824u,
+        0u,        0u,      198u,
+        0u,        0u,        1u
    };

    uint32_t v0, v1, v2, hi;
@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
 * Sample a bit with probability exp(-x) for some x >= 0.
 */
 static int
-BerExp(prng *p, fpr x) {
+BerExp(prng *p, fpr x, fpr ccs) {
    int s, i;
    fpr r;
    uint32_t sw, w;
@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
     * case). The bias is negligible since fpr_expm_p63() only computes
     * with 51 bits of precision or so.
     */
-    z = ((fpr_expm_p63(r) << 1) - 1) >> s;
+    z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;

    /*
     * Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
    return (int)(w >> 31);
 }

-typedef struct {
-    prng p;
-    fpr sigma_min;
-} sampler_context;
-
 /*
 * The sampler produces a random integer that follows a discrete Gaussian
 * distribution, centered on mu, and with standard deviation sigma. The
@ -909,8 +1078,8 @@ typedef struct {
 * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
 * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
 */
-static int
-sampler(void *ctx, fpr mu, fpr isigma) {
+int
+PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
    sampler_context *spc;
    int s;
    fpr r, dss, ccs;
@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
         *  - b = 0: z <= 0 and sampled against a Gaussian
         *    centered on 0.
         */
-        z0 = gaussian0_sampler(&spc->p);
+        z0 = PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(&spc->p);
        b = prng_get_u8(&spc->p) & 1;
        z = b + ((b << 1) - 1) * z0;

@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
         */
        x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
        x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
-        x = fpr_mul(x, ccs);
-        if (BerExp(&spc->p, x)) {
+        if (BerExp(&spc->p, x, ccs)) {
            /*
             * Rejection sampling was centered on r, but the
             * actual center is mu = s + r.
@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {

 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
                                  const fpr *expanded_key,
                                  const uint16_t *hm, unsigned logn, uint8_t *tmp) {
    fpr *ftmp;
@ -1025,7 +1193,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
                        ? fpr_sigma_min_10
                        : fpr_sigma_min_9;
        PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON512_CLEAN_sampler;
        samp_ctx = &spc;

        /*
@ -1040,7 +1208,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,

 /* see inner.h */
 void
-PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
+PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
                                 const int8_t *f, const int8_t *g,
                                 const int8_t *F, const int8_t *G,
                                 const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@ -1070,7 +1238,7 @@ PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
                        ? fpr_sigma_min_10
                        : fpr_sigma_min_9;
        PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
-        samp = sampler;
+        samp = PQCLEAN_FALCON512_CLEAN_sampler;
        samp_ctx = &spc;

        /*
--- a/crypto_sign/falcon-512/clean/vrfy.c
+++ b/crypto_sign/falcon-512/clean/vrfy.c
@ -649,7 +649,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    }

    /*
-     * Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
+     * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
     */
    mq_NTT(tt, logn);
    mq_poly_montymul_ntt(tt, h, logn);
@ -657,7 +657,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    mq_poly_sub(tt, c0, logn);

    /*
-     * Normalize s1 elements into the [-q/2..q/2] range.
+     * Normalize -s1 elements into the [-q/2..q/2] range.
     */
    for (u = 0; u < n; u ++) {
        int32_t w;
@ -668,7 +668,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
    }

    /*
-     * Signature is valid if and only if the aggregate (s1,s2) vector
+     * Signature is valid if and only if the aggregate (-s1,s2) vector
     * is short enough.
     */
    return PQCLEAN_FALCON512_CLEAN_is_short((int16_t *)tt, s2, logn);
@ -699,7 +699,7 @@ PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
    return 1;
 }

-/* see internal.h */
+/* see inner.h */
 int
 PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
        const int8_t *f, const int8_t *g, const int8_t *F,
@ -743,3 +743,110 @@ PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
    }
    return 1;
 }
+
+/* see inner.h -- returns 1 if no entry of s2 (taken mod q) is zero after mq_NTT(), else 0 */
+int
+PQCLEAN_FALCON512_CLEAN_is_invertible(
+    const int16_t *s2, unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt; /* tmp viewed as an array of n 16-bit words */
+    uint32_t r; /* OR-accumulator: bit 31 becomes set if some tt[u] is 0 */
+
+    n = (size_t)1 << logn;
+    tt = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31); /* branchless: add Q only when s2[u] < 0 */
+        tt[u] = (uint16_t)w;
+    }
+    mq_NTT(tt, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1); /* tt[u] == 0 gives 0xFFFFFFFF */
+    }
+    return (int)(1u - (r >> 31)); /* 1 when bit 31 of r stayed clear */
+}
+
+/* see inner.h -- rebuilds h = (c0 - s1) / s2 in h[] (always written); returns 1 if accepted, else 0 */
+int
+PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
+                                       const uint16_t *c0, const int16_t *s1, const int16_t *s2,
+                                       unsigned logn, uint8_t *tmp) {
+    size_t u, n;
+    uint16_t *tt; /* tmp viewed as an array of n 16-bit words; receives s2 mod q */
+    uint32_t r; /* OR-accumulator: bit 31 becomes set if some tt[u] is 0 */
+
+    n = (size_t)1 << logn;
+
+    /*
+     * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
+     * and c0 - s1 into h[].
+     */
+    tt = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u];
+        w += Q & -(w >> 31); /* branchless: add Q only when s2[u] < 0 */
+        tt[u] = (uint16_t)w;
+
+        w = (uint32_t)s1[u];
+        w += Q & -(w >> 31); /* same sign fix-up for s1[u] */
+        w = mq_sub(c0[u], w);
+        h[u] = (uint16_t)w;
+    }
+
+    /*
+     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
+     * is zero (in NTT representation) then the operation fails. We
+     * record that information in a flag so that we do not deviate
+     * from strict constant-time processing; if all coefficients of
+     * s2 are non-zero, then the high bit of r will be zero.
+     */
+    mq_NTT(tt, logn);
+    mq_NTT(h, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        r |= (uint32_t)(tt[u] - 1); /* tt[u] == 0 gives 0xFFFFFFFF */
+        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); /* performed even when tt[u] == 0 */
+    }
+    mq_iNTT(h, logn);
+
+    /*
+     * Signature is acceptable if and only if it is short enough,
+     * and s2 was invertible mod phi mod q. The caller must still
+     * check that the rebuilt public key matches the expected
+     * value (e.g. through a hash).
+     */
+    r = ~r & (uint32_t) - PQCLEAN_FALCON512_CLEAN_is_short(s1, s2, logn);
+    return (int)(r >> 31); /* bit 31 of r: 1 = accepted, 0 = rejected */
+}
+
+/* see inner.h -- returns how many entries of sig (taken mod q) are zero after mq_NTT() */
+int
+PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
+    uint16_t *s2; /* tmp viewed as an array of n 16-bit words */
+    size_t u, n;
+    uint32_t r; /* running count of zero words found after mq_NTT() */
+
+    n = (size_t)1 << logn;
+    s2 = (uint16_t *)tmp;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)sig[u];
+        w += Q & -(w >> 31); /* branchless: add Q only when sig[u] < 0 */
+        s2[u] = (uint16_t)w;
+    }
+    mq_NTT(s2, logn);
+    r = 0;
+    for (u = 0; u < n; u ++) {
+        uint32_t w;
+
+        w = (uint32_t)s2[u] - 1u; /* wraps to 0xFFFFFFFF only when s2[u] == 0 */
+        r += (w >> 31); /* adds 1 for a zero word, 0 otherwise */
+    }
+    return (int)r;
+}