From 44a050106a2952bf5e4438621882703a1e92fc3f Mon Sep 17 00:00:00 2001 From: Thomas Pornin Date: Thu, 19 Sep 2019 15:55:28 -0400 Subject: [PATCH] Fixed sampler bug (update to new upstream Falcon code 2019-09-18). --- crypto_sign/falcon-1024/META.yml | 4 +- crypto_sign/falcon-1024/clean/common.c | 42 +++- crypto_sign/falcon-1024/clean/fpr.c | 25 ++- crypto_sign/falcon-1024/clean/fpr.h | 4 +- crypto_sign/falcon-1024/clean/inner.h | 195 ++++++++++++++++-- crypto_sign/falcon-1024/clean/keygen.c | 21 +- crypto_sign/falcon-1024/clean/pqclean.c | 38 ++-- crypto_sign/falcon-1024/clean/rng.c | 4 +- crypto_sign/falcon-1024/clean/sign.c | 250 ++++++++++++++++++++---- crypto_sign/falcon-1024/clean/vrfy.c | 115 ++++++++++- crypto_sign/falcon-512/META.yml | 4 +- crypto_sign/falcon-512/clean/common.c | 42 +++- crypto_sign/falcon-512/clean/fpr.c | 25 ++- crypto_sign/falcon-512/clean/fpr.h | 4 +- crypto_sign/falcon-512/clean/inner.h | 195 ++++++++++++++++-- crypto_sign/falcon-512/clean/keygen.c | 21 +- crypto_sign/falcon-512/clean/pqclean.c | 38 ++-- crypto_sign/falcon-512/clean/rng.c | 4 +- crypto_sign/falcon-512/clean/sign.c | 250 ++++++++++++++++++++---- crypto_sign/falcon-512/clean/vrfy.c | 115 ++++++++++- 20 files changed, 1204 insertions(+), 192 deletions(-) diff --git a/crypto_sign/falcon-1024/META.yml b/crypto_sign/falcon-1024/META.yml index ddae840c..695cf3c0 100644 --- a/crypto_sign/falcon-1024/META.yml +++ b/crypto_sign/falcon-1024/META.yml @@ -4,8 +4,8 @@ claimed-nist-level: 5 length-public-key: 1793 length-secret-key: 2305 length-signature: 1330 -nistkat-sha256: ad3d17869fdc05deae13ffa2ef26bde125b42f61b2dcd861a1ae20adcb2accc5 -testvectors-sha256: bd8076c13722d8c555c68fc6bd7763e1a9dd5483ee7c8d1c74dd2df459c72a40 +nistkat-sha256: ef2104e326221515621638ca03cd99802271bdd9907e2ae5fc7b8d19d696c584 +testvectors-sha256: 14ee0e3f0ea4b9b25193a54eed9100b1bb1cf5dbc7813fd9dc9180c1ea1a1042 principal-submitters: - Thomas Prest auxiliary-submitters: diff --git 
a/crypto_sign/falcon-1024/clean/common.c b/crypto_sign/falcon-1024/clean/common.c index 7dc8ad20..bb2d7ece 100644 --- a/crypto_sign/falcon-1024/clean/common.c +++ b/crypto_sign/falcon-1024/clean/common.c @@ -33,10 +33,43 @@ /* see inner.h */ void -PQCLEAN_FALCON1024_CLEAN_hash_to_point( - shake256_context *sc, - uint16_t *x, unsigned logn, uint8_t *tmp) { +PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime( + inner_shake256_context *sc, + uint16_t *x, unsigned logn) { + /* + * This is the straightforward per-the-spec implementation. It + * is not constant-time, thus it might reveal information on the + * plaintext (at least, enough to check the plaintext against a + * list of potential plaintexts) in a scenario where the + * attacker does not have access to the signature value or to + * the public key, but knows the nonce (without knowledge of the + * nonce, the hashed output cannot be matched against potential + * plaintexts). + */ + size_t n; + n = (size_t)1 << logn; + while (n > 0) { + uint8_t buf[2]; + uint32_t w; + + inner_shake256_extract(sc, (void *)buf, sizeof buf); + w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; + if (w < 61445) { + while (w >= 12289) { + w -= 12289; + } + *x ++ = (uint16_t)w; + n --; + } + } +} + +/* see inner.h */ +void +PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct( + inner_shake256_context *sc, + uint16_t *x, unsigned logn, uint8_t *tmp) { /* * Each 16-bit sample is a value in 0..65535. 
The value is * kept if it falls in 0..61444 (because 61445 = 5*12289) @@ -97,7 +130,7 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point( uint8_t buf[2]; uint32_t w, wr; - shake256_extract(sc, buf, sizeof buf); + inner_shake256_extract(sc, buf, sizeof buf); w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); @@ -196,7 +229,6 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point( *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); } } - } /* see inner.h */ diff --git a/crypto_sign/falcon-1024/clean/fpr.c b/crypto_sign/falcon-1024/clean/fpr.c index b9a8999d..636b4092 100644 --- a/crypto_sign/falcon-1024/clean/fpr.c +++ b/crypto_sign/falcon-1024/clean/fpr.c @@ -507,7 +507,7 @@ fpr_sqrt(fpr x) { uint64_t -fpr_expm_p63(fpr x) { +fpr_expm_p63(fpr x, fpr ccs) { /* * Polynomial approximation of exp(-x) is taken from FACCT: * https://eprint.iacr.org/2018/1234 @@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) { uint64_t z, y; unsigned u; + uint32_t z0, z1, y0, y1; + uint64_t a, b; y = C[0]; z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; @@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) { * also have appropriate IEEE754 floating-point support, * which is better. */ - uint32_t z0, z1, y0, y1; - uint64_t a, b, c; + uint64_t c; z0 = (uint32_t)z; z1 = (uint32_t)(z >> 32); @@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) { c += (uint64_t)z1 * (uint64_t)y1; y = C[u] - c; } + + /* + * The scaling factor must be applied at the end. Since y is now + * in fixed-point notation, we have to convert the factor to the + * same format, and do an extra integer multiplication. 
+ */ + z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; + z0 = (uint32_t)z; + z1 = (uint32_t)(z >> 32); + y0 = (uint32_t)y; + y1 = (uint32_t)(y >> 32); + a = ((uint64_t)z0 * (uint64_t)y1) + + (((uint64_t)z0 * (uint64_t)y0) >> 32); + b = ((uint64_t)z1 * (uint64_t)y0); + y = (a >> 32) + (b >> 32); + y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); + y += (uint64_t)z1 * (uint64_t)y1; + return y; } diff --git a/crypto_sign/falcon-1024/clean/fpr.h b/crypto_sign/falcon-1024/clean/fpr.h index 2dfc9b85..c3103dc1 100644 --- a/crypto_sign/falcon-1024/clean/fpr.h +++ b/crypto_sign/falcon-1024/clean/fpr.h @@ -232,6 +232,8 @@ static const fpr fpr_zero = 0; static const fpr fpr_one = 4607182418800017408; static const fpr fpr_two = 4611686018427387904; static const fpr fpr_onehalf = 4602678819172646912; +static const fpr fpr_invsqrt2 = 4604544271217802189; +static const fpr fpr_invsqrt8 = 4600040671590431693; static const fpr fpr_ptwo31 = 4746794007248502784; static const fpr fpr_ptwo31m1 = 4746794007244308480; static const fpr fpr_mtwo31m1 = 13970166044099084288U; @@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) { * bits or so. */ #define fpr_expm_p63 PQCLEAN_FALCON1024_CLEAN_fpr_expm_p63 -uint64_t fpr_expm_p63(fpr x); +uint64_t fpr_expm_p63(fpr x, fpr ccs); #define fpr_gm_tab PQCLEAN_FALCON1024_CLEAN_fpr_gm_tab extern const fpr fpr_gm_tab[]; diff --git a/crypto_sign/falcon-1024/clean/inner.h b/crypto_sign/falcon-1024/clean/inner.h index a12d3755..98855160 100644 --- a/crypto_sign/falcon-1024/clean/inner.h +++ b/crypto_sign/falcon-1024/clean/inner.h @@ -34,6 +34,45 @@ * @author Thomas Pornin */ +/* + * IMPORTANT API RULES + * ------------------- + * + * This API has some non-trivial usage rules: + * + * + * - All public functions (i.e. the non-static ones) must be referenced + * with the PQCLEAN_FALCON1024_CLEAN_ macro (e.g. PQCLEAN_FALCON1024_CLEAN_verify_raw for the verify_raw() + * function). 
That macro adds a prefix to the name, which is + * configurable with the FALCON_PREFIX macro. This allows compiling + * the code into a specific "namespace" and potentially including + * several versions of this code into a single application (e.g. to + * have AVX2 and non-AVX2 variants and select the one to use at + * runtime based on availability of AVX2 opcodes). + * + * - Functions that need temporary buffers expect them as a final + * tmp[] array of type uint8_t*, with a size which is documented for + * each function. However, most have some alignment requirements, + * because they will use the array to store 16-bit, 32-bit or 64-bit + * values (e.g. uint64_t or double). The caller must ensure proper + * alignment. What happens on unaligned access depends on the + * underlying architecture, ranging from a slight time penalty + * to immediate termination of the process. + * + * - Some functions rely on specific rounding rules and precision for + * floating-point numbers. On some systems (in particular 32-bit x86 + * with the 387 FPU), this requires setting a hardware control + * word. The caller MUST use set_fpu_cw() to ensure proper precision: + * + * oldcw = set_fpu_cw(2); + * PQCLEAN_FALCON1024_CLEAN_sign_dyn(...); + * set_fpu_cw(oldcw); + * + * On systems where the native floating-point precision is already + * proper, or integer-based emulation is used, the set_fpu_cw() + * function does nothing, so it can be called systematically. + */ + #include #include @@ -42,22 +81,47 @@ + +/* + * Some computations with floating-point elements, in particular + * rounding to the nearest integer, rely on operations using _exactly_ + * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit + * x86, the 387 FPU may be used (depending on the target OS) and, in + * that case, may use more precision bits (i.e. 
64 bits, for an 80-bit + * total type length); to prevent miscomputations, we define an explicit + * function that modifies the precision in the FPU control word. + * + * set_fpu_cw() sets the precision to the provided value, and returns + * the previously set precision; callers are supposed to restore the + * previous precision on exit. The correct (52-bit) precision is + * configured with the value "2". On unsupported compilers, or on + * targets other than 32-bit x86, or when the native 'double' type is + * not used, the set_fpu_cw() function does nothing at all. + */ +static inline unsigned +set_fpu_cw(unsigned x) { + return x; +} + + + + /* ==================================================================== */ /* * SHAKE256 implementation (shake.c). * * API is defined to be easily replaced with the fips202.h API defined - * as part of PQ Clean. + * as part of PQClean. */ #include "fips202.h" -#define shake256_context shake256incctx -#define shake256_init(sc) shake256_inc_init(sc) -#define shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define shake256_flip(sc) shake256_inc_finalize(sc) -#define shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) +#define inner_shake256_context shake256incctx +#define inner_shake256_init(sc) shake256_inc_init(sc) +#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) +#define inner_shake256_flip(sc) shake256_inc_finalize(sc) +#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) /* ==================================================================== */ @@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON1024_CLEAN_max_sig_bits[]; /* * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. + * point. 
This is the non-constant-time version, which may leak enough + * information to serve as a stop condition on a brute force attack on + * the hashed message (provided that the nonce value is known). */ -void PQCLEAN_FALCON1024_CLEAN_hash_to_point(shake256_context *sc, +void PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc, + uint16_t *x, unsigned logn); + +/* + * From a SHAKE256 context (must be already flipped), produce a new + * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. + * This function is constant-time but is typically more expensive than + * PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(). + * + * tmp[] must have 16-bit alignment. + */ +void PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc, uint16_t *x, unsigned logn, uint8_t *tmp); /* @@ -184,6 +261,8 @@ void PQCLEAN_FALCON1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn); * logn is the degree log * tmp[] temporary, must have at least 2*2^logn bytes * Returned value is 1 on success, 0 on error. + * + * tmp[] must have 16-bit alignment. */ int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, const uint16_t *h, unsigned logn, uint8_t *tmp); @@ -195,6 +274,7 @@ int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, * reported if f is not invertible mod phi mod q). * * The tmp[] array must have room for at least 2*2^logn elements. + * tmp[] must have 16-bit alignment. */ int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h, const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); @@ -208,11 +288,53 @@ int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h, * The tmp[] array must have room for at least 4*2^logn bytes. * * Returned value is 1 in success, 0 on error (f not invertible). + * tmp[] must have 16-bit alignment. 
*/ int PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G, const int8_t *f, const int8_t *g, const int8_t *F, unsigned logn, uint8_t *tmp); +/* + * Test whether a given polynomial is invertible modulo phi and q. + * Polynomial coefficients are small integers. + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON1024_CLEAN_is_invertible( + const int16_t *s2, unsigned logn, uint8_t *tmp); + +/* + * Count the number of elements of value zero in the NTT representation + * of the given polynomial: this is the number of primitive 2n-th roots + * of unity (modulo q = 12289) that are roots of the provided polynomial + * (taken modulo q). + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp); + +/* + * Internal signature verification with public key recovery: + * h[] receives the public key (NOT in NTT/Montgomery format) + * c0[] contains the hashed nonce+message + * s1[] is the first signature half + * s2[] is the second signature half + * logn is the degree log + * tmp[] temporary, must have at least 2*2^logn bytes + * Returned value is 1 on success, 0 on error. Success is returned if + * the signature is a short enough vector; in that case, the public + * key has been written to h[]. However, the caller must still + * verify that h[] is the correct value (e.g. with regard to a known + * hash of the public key). + * + * h[] must not overlap with any of the other arrays. + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h, + const uint16_t *c0, const int16_t *s1, const int16_t *s2, + unsigned logn, uint8_t *tmp); + /* ==================================================================== */ /* * Implementation of floating-point real numbers (fpr.h, fpr.c). @@ -358,7 +480,7 @@ typedef struct { * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 * context (in "flipped" state) to obtain its initial state. 
*/ -void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src); +void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src); /* * Refill the PRNG buffer. This is normally invoked automatically, and @@ -586,6 +708,9 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f, /* * Required sizes of the temporary buffer (in bytes). + * + * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 + * or 2) where it is slightly greater. */ #define FALCON_KEYGEN_TEMP_1 136 #define FALCON_KEYGEN_TEMP_2 272 @@ -608,8 +733,11 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f, * public key is written in h. Either or both of G and h may be NULL, * in which case the corresponding element is not returned (they can * be recomputed from f, g and F). + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ -void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng, +void PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng, int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, unsigned logn, uint8_t *tmp); @@ -624,6 +752,9 @@ void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng, * a total of (8*logn+40)*2^logn bytes. * * The tmp[] array must have room for at least 48*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, @@ -636,9 +767,15 @@ void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key, * * The sig[] and hm[] buffers may overlap. * + * On successful output, the start of the tmp[] buffer contains the s1 + * vector (as int16_t elements). + * * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). 
*/ -void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, +void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng, const fpr *expanded_key, const uint16_t *hm, unsigned logn, uint8_t *tmp); @@ -651,13 +788,47 @@ void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, * * The sig[] and hm[] buffers may overlap. * + * On successful output, the start of the tmp[] buffer contains the s1 + * vector (as int16_t elements). + * * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ -void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, +void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, const uint16_t *hm, unsigned logn, uint8_t *tmp); +/* + * Internal sampler engine. Exported for tests. + * + * sampler_context wraps around a source of random numbers (PRNG) and + * the sigma_min value (nominally dependent on the degree). + * + * sampler() takes as parameters: + * ctx pointer to the sampler_context structure + * mu center for the distribution + * isigma inverse of the distribution standard deviation + * It returns an integer sampled along the Gaussian distribution centered + * on mu and of standard deviation sigma = 1/isigma. + * + * gaussian0_sampler() takes as parameter a pointer to a PRNG, and + * returns an integer sampled along a half-Gaussian with standard + * deviation sigma0 = 1.8205 (center is 0, returned value is + * nonnegative). 
+ */ + +typedef struct { + prng p; + fpr sigma_min; +} sampler_context; + +int PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma); + +int PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p); + /* ==================================================================== */ #endif diff --git a/crypto_sign/falcon-1024/clean/keygen.c b/crypto_sign/falcon-1024/clean/keygen.c index ad6eb66f..47081537 100644 --- a/crypto_sign/falcon-1024/clean/keygen.c +++ b/crypto_sign/falcon-1024/clean/keygen.c @@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride, /* ==================================================================== */ + +#define RNG_CONTEXT inner_shake256_context + /* * Get a random 8-byte integer from a SHAKE-based RNG. This function * ensures consistent interpretation of the SHAKE output so that @@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride, * a known seed is used. */ static inline uint64_t -get_rng_u64(shake256_context *rng) { +get_rng_u64(inner_shake256_context *rng) { /* * We enforce little-endian representation. */ uint8_t tmp[8]; - shake256_extract(rng, tmp, sizeof tmp); + inner_shake256_extract(rng, tmp, sizeof tmp); return (uint64_t)tmp[0] | ((uint64_t)tmp[1] << 8) | ((uint64_t)tmp[2] << 16) @@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) { | ((uint64_t)tmp[7] << 56); } + /* * Table below incarnates a discrete Gaussian distribution: * D(x) = exp(-(x^2)/(2*sigma^2)) @@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = { * together for lower dimensions. 
*/ static int -mkgauss(shake256_context *rng, unsigned logn) { +mkgauss(RNG_CONTEXT *rng, unsigned logn) { unsigned u, g; int val; @@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top, fpr xv; xv = fpr_mul(rt2[u], pdc); + /* * Sometimes the values can be out-of-bounds if * the algorithm fails; we must not call @@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G, * also makes sure that the resultant of the polynomial with phi is odd. */ static void -poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) { +poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) { size_t n, u; unsigned mod2; @@ -4046,7 +4051,7 @@ restart: /* see falcon.h */ void -PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng, +PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng, int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, unsigned logn, uint8_t *tmp) { /* @@ -4070,8 +4075,10 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng, */ size_t n, u; uint16_t *h2, *tmp2; + RNG_CONTEXT *rc; n = MKN(logn); + rc = rng; /* * We need to generate f and g randomly, until we find values @@ -4104,8 +4111,8 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng, * (i.e. the resultant of the polynomial with phi * will be odd). */ - poly_small_mkgauss(rng, f, logn); - poly_small_mkgauss(rng, g, logn); + poly_small_mkgauss(rc, f, logn); + poly_small_mkgauss(rc, g, logn); /* * Verify that all coefficients are within the bounds diff --git a/crypto_sign/falcon-1024/clean/pqclean.c b/crypto_sign/falcon-1024/clean/pqclean.c index 50e21f11..bbab1921 100644 --- a/crypto_sign/falcon-1024/clean/pqclean.c +++ b/crypto_sign/falcon-1024/clean/pqclean.c @@ -51,16 +51,16 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair( int8_t f[1024], g[1024], F[1024]; uint16_t h[1024]; unsigned char seed[48]; - shake256_context rng; + inner_shake256_context rng; size_t u, v; /* * Generate key pair. 
*/ randombytes(seed, sizeof seed); - shake256_init(&rng); - shake256_inject(&rng, seed, sizeof seed); - shake256_flip(&rng); + inner_shake256_init(&rng); + inner_shake256_inject(&rng, seed, sizeof seed); + inner_shake256_flip(&rng); PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b); /* @@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, uint16_t hm[1024]; } r; unsigned char seed[48]; - shake256_context sc; + inner_shake256_context sc; size_t u, v; /* @@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, /* * Hash message nonce + message into a vector. */ - shake256_init(&sc); - shake256_inject(&sc, nonce, NONCELEN); - shake256_inject(&sc, m, mlen); - shake256_flip(&sc); - PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, r.hm, 10, tmp.b); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, nonce, NONCELEN); + inner_shake256_inject(&sc, m, mlen); + inner_shake256_flip(&sc); + PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b); /* * Initialize a RNG. */ randombytes(seed, sizeof seed); - shake256_init(&sc); - shake256_inject(&sc, seed, sizeof seed); - shake256_flip(&sc); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, seed, sizeof seed); + inner_shake256_flip(&sc); /* * Compute and return the signature. This loops until a signature @@ -225,7 +225,7 @@ do_verify( } tmp; uint16_t h[1024], hm[1024]; int16_t sig[1024]; - shake256_context sc; + inner_shake256_context sc; /* * Decode public key. @@ -253,11 +253,11 @@ do_verify( /* * Hash nonce + message into a vector. */ - shake256_init(&sc); - shake256_inject(&sc, nonce, NONCELEN); - shake256_inject(&sc, m, mlen); - shake256_flip(&sc); - PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, hm, 10, tmp.b); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, nonce, NONCELEN); + inner_shake256_inject(&sc, m, mlen); + inner_shake256_flip(&sc); + PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b); /* * Verify signature. 
diff --git a/crypto_sign/falcon-1024/clean/rng.c b/crypto_sign/falcon-1024/clean/rng.c index e247a639..6be52b65 100644 --- a/crypto_sign/falcon-1024/clean/rng.c +++ b/crypto_sign/falcon-1024/clean/rng.c @@ -36,7 +36,7 @@ /* see inner.h */ void -PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) { +PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) { /* * To ensure reproducibility for a given seed, we * must enforce little-endian interpretation of @@ -46,7 +46,7 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) { uint64_t th, tl; int i; - shake256_extract(src, tmp, 56); + inner_shake256_extract(src, tmp, 56); for (i = 0; i < 14; i ++) { uint32_t w; diff --git a/crypto_sign/falcon-1024/clean/sign.c b/crypto_sign/falcon-1024/clean/sign.c index 9307206e..d6689eb3 100644 --- a/crypto_sign/falcon-1024/clean/sign.c +++ b/crypto_sign/falcon-1024/clean/sign.c @@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, size_t n, hn; const fpr *tree0, *tree1; - n = (size_t)1 << logn; - if (n == 1) { + /* + * When logn == 2, we inline the last two recursion levels. + */ + if (logn == 2) { + fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; + fpr a_re, a_im, b_re, b_im, c_re, c_im; + + tree0 = tree + 4; + tree1 = tree + 8; + + /* + * We split t1 into w*, then do the recursive invocation, + * with output in w*. We finally merge back into z1. 
+ */ + a_re = t1[0]; + a_im = t1[2]; + b_re = t1[1]; + b_im = t1[3]; + c_re = fpr_add(a_re, b_re); + c_im = fpr_add(a_im, b_im); + w0 = fpr_half(c_re); + w1 = fpr_half(c_im); + c_re = fpr_sub(a_re, b_re); + c_im = fpr_sub(a_im, b_im); + w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); + w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); + + x0 = w2; + x1 = w3; + sigma = tree1[3]; + w2 = fpr_of(samp(samp_ctx, x0, sigma)); + w3 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, w2); + a_im = fpr_sub(x1, w3); + b_re = tree1[0]; + b_im = tree1[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, w0); + x1 = fpr_add(c_im, w1); + sigma = tree1[2]; + w0 = fpr_of(samp(samp_ctx, x0, sigma)); + w1 = fpr_of(samp(samp_ctx, x1, sigma)); + + a_re = w0; + a_im = w1; + b_re = w2; + b_im = w3; + c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); + c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); + z1[0] = w0 = fpr_add(a_re, c_re); + z1[2] = w2 = fpr_add(a_im, c_im); + z1[1] = w1 = fpr_sub(a_re, c_re); + z1[3] = w3 = fpr_sub(a_im, c_im); + + /* + * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. + */ + w0 = fpr_sub(t1[0], w0); + w1 = fpr_sub(t1[1], w1); + w2 = fpr_sub(t1[2], w2); + w3 = fpr_sub(t1[3], w3); + + a_re = w0; + a_im = w2; + b_re = tree[0]; + b_im = tree[2]; + w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + a_re = w1; + a_im = w3; + b_re = tree[1]; + b_im = tree[3]; + w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + + w0 = fpr_add(w0, t0[0]); + w1 = fpr_add(w1, t0[1]); + w2 = fpr_add(w2, t0[2]); + w3 = fpr_add(w3, t0[3]); + + /* + * Second recursive invocation. 
+ */ + a_re = w0; + a_im = w2; + b_re = w1; + b_im = w3; + c_re = fpr_add(a_re, b_re); + c_im = fpr_add(a_im, b_im); + w0 = fpr_half(c_re); + w1 = fpr_half(c_im); + c_re = fpr_sub(a_re, b_re); + c_im = fpr_sub(a_im, b_im); + w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); + w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); + + x0 = w2; + x1 = w3; + sigma = tree0[3]; + w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); + w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, y0); + a_im = fpr_sub(x1, y1); + b_re = tree0[0]; + b_im = tree0[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, w0); + x1 = fpr_add(c_im, w1); + sigma = tree0[2]; + w0 = fpr_of(samp(samp_ctx, x0, sigma)); + w1 = fpr_of(samp(samp_ctx, x1, sigma)); + + a_re = w0; + a_im = w1; + b_re = w2; + b_im = w3; + c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); + c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); + z0[0] = fpr_add(a_re, c_re); + z0[2] = fpr_add(a_im, c_im); + z0[1] = fpr_sub(a_re, c_re); + z0[3] = fpr_sub(a_im, c_im); + + return; + } + + /* + * Case logn == 1 is reachable only when using Falcon-2 (the + * smallest size for which Falcon is mathematically defined, but + * of course way too insecure to be of any use). 
+ */ + if (logn == 1) { + fpr x0, x1, y0, y1, sigma; + fpr a_re, a_im, b_re, b_im, c_re, c_im; + + x0 = t1[0]; + x1 = t1[1]; + sigma = tree[3]; + z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); + z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, y0); + a_im = fpr_sub(x1, y1); + b_re = tree[0]; + b_im = tree[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, t0[0]); + x1 = fpr_add(c_im, t0[1]); + sigma = tree[2]; + z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); + z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); + + return; + } + + /* + * Normal end of recursion is for logn == 0. Since the last + * steps of the recursions were inlined in the blocks above + * (when logn == 1 or 2), this case is not reachable, and is + * retained here only for documentation purposes. + + if (logn == 0) { fpr x0, x1, sigma; x0 = t0[0]; @@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, return; } + */ + + /* + * General recursive case (logn >= 3). + */ + + n = (size_t)1 << logn; hn = n >> 1; tree0 = tree + n; tree1 = tree + n + ffLDL_treesize(logn - 1); @@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, const fpr *b00, *b01, *b10, *b11, *tree; fpr ni; uint32_t sqn, ng; - int16_t *s2tmp; + int16_t *s1tmp, *s2tmp; n = MKN(logn); t0 = tmp; @@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, /* * Compute the signature. 
*/ + s1tmp = (int16_t *)tx; sqn = 0; ng = 0; for (u = 0; u < n; u ++) { @@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); sqn += (uint32_t)(z * z); ng |= sqn; + s1tmp[u] = (int16_t)z; } sqn |= -(ng >> 31); @@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, } if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) { memcpy(s2, s2tmp, n * sizeof * s2); + memcpy(tmp, s1tmp, n * sizeof * s1tmp); return 1; } return 0; @@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; fpr ni; uint32_t sqn, ng; - int16_t *s2tmp; + int16_t *s1tmp, *s2tmp; n = MKN(logn); @@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, PQCLEAN_FALCON1024_CLEAN_iFFT(t0, logn); PQCLEAN_FALCON1024_CLEAN_iFFT(t1, logn); + s1tmp = (int16_t *)tx; sqn = 0; ng = 0; for (u = 0; u < n; u ++) { @@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); sqn += (uint32_t)(z * z); ng |= sqn; + s1tmp[u] = (int16_t)z; } sqn |= -(ng >> 31); @@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, } if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) { memcpy(s2, s2tmp, n * sizeof * s2); + memcpy(tmp, s1tmp, n * sizeof * s1tmp); return 1; } return 0; @@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, * Sample an integer value along a half-gaussian distribution centered * on zero and standard deviation 1.8205, with a precision of 72 bits. 
*/ -static int -gaussian0_sampler(prng *p) { +int +PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p) { static const uint32_t dist[] = { - 6031371U, 13708371U, 13035518U, - 5186761U, 1487980U, 12270720U, - 3298653U, 4688887U, 5511555U, - 1551448U, 9247616U, 9467675U, - 539632U, 14076116U, 5909365U, - 138809U, 10836485U, 13263376U, - 26405U, 15335617U, 16601723U, - 3714U, 14514117U, 13240074U, - 386U, 8324059U, 3276722U, - 29U, 12376792U, 7821247U, - 1U, 11611789U, 3398254U, - 0U, 1194629U, 4532444U, - 0U, 37177U, 2973575U, - 0U, 855U, 10369757U, - 0U, 14U, 9441597U, - 0U, 0U, 3075302U, - 0U, 0U, 28626U, - 0U, 0U, 197U, - 0U, 0U, 1U + 10745844u, 3068844u, 3741698u, + 5559083u, 1580863u, 8248194u, + 2260429u, 13669192u, 2736639u, + 708981u, 4421575u, 10046180u, + 169348u, 7122675u, 4136815u, + 30538u, 13063405u, 7650655u, + 4132u, 14505003u, 7826148u, + 417u, 16768101u, 11363290u, + 31u, 8444042u, 8086568u, + 1u, 12844466u, 265321u, + 0u, 1232676u, 13644283u, + 0u, 38047u, 9111839u, + 0u, 870u, 6138264u, + 0u, 14u, 12545723u, + 0u, 0u, 3104126u, + 0u, 0u, 28824u, + 0u, 0u, 198u, + 0u, 0u, 1u }; uint32_t v0, v1, v2, hi; @@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) { * Sample a bit with probability exp(-x) for some x >= 0. */ static int -BerExp(prng *p, fpr x) { +BerExp(prng *p, fpr x, fpr ccs) { int s, i; fpr r; uint32_t sw, w; @@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) { * case). The bias is negligible since fpr_expm_p63() only computes * with 51 bits of precision or so. */ - z = ((fpr_expm_p63(r) << 1) - 1) >> s; + z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; /* * Sample a bit with probability exp(-x). Since x = s*log(2) + r, @@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) { return (int)(w >> 31); } -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - /* * The sampler produces a random integer that follows a discrete Gaussian * distribution, centered on mu, and with standard deviation sigma. 
The @@ -909,8 +1078,8 @@ typedef struct { * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. */ -static int -sampler(void *ctx, fpr mu, fpr isigma) { +int +PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { sampler_context *spc; int s; fpr r, dss, ccs; @@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { * - b = 0: z <= 0 and sampled against a Gaussian * centered on 0. */ - z0 = gaussian0_sampler(&spc->p); + z0 = PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(&spc->p); b = prng_get_u8(&spc->p) & 1; z = b + ((b << 1) - 1) * z0; @@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { */ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - x = fpr_mul(x, ccs); - if (BerExp(&spc->p, x)) { + if (BerExp(&spc->p, x, ccs)) { /* * Rejection sampling was centered on r, but the * actual center is mu = s + r. @@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { /* see inner.h */ void -PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, +PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng, const fpr *expanded_key, const uint16_t *hm, unsigned logn, uint8_t *tmp) { fpr *ftmp; @@ -1025,7 +1193,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, ? 
fpr_sigma_min_10 : fpr_sigma_min_9; PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng); - samp = sampler; + samp = PQCLEAN_FALCON1024_CLEAN_sampler; samp_ctx = &spc; /* @@ -1040,7 +1208,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, /* see inner.h */ void -PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, +PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, const uint16_t *hm, unsigned logn, uint8_t *tmp) { @@ -1070,7 +1238,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, ? fpr_sigma_min_10 : fpr_sigma_min_9; PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng); - samp = sampler; + samp = PQCLEAN_FALCON1024_CLEAN_sampler; samp_ctx = &spc; /* diff --git a/crypto_sign/falcon-1024/clean/vrfy.c b/crypto_sign/falcon-1024/clean/vrfy.c index 6c3f4abf..780127cf 100644 --- a/crypto_sign/falcon-1024/clean/vrfy.c +++ b/crypto_sign/falcon-1024/clean/vrfy.c @@ -649,7 +649,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, } /* - * Compute s1 = s2*h - c0 mod phi mod q (in tt[]). + * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). */ mq_NTT(tt, logn); mq_poly_montymul_ntt(tt, h, logn); @@ -657,7 +657,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, mq_poly_sub(tt, c0, logn); /* - * Normalize s1 elements into the [-q/2..q/2] range. + * Normalize -s1 elements into the [-q/2..q/2] range. */ for (u = 0; u < n; u ++) { int32_t w; @@ -668,7 +668,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, } /* - * Signature is valid if and only if the aggregate (s1,s2) vector + * Signature is valid if and only if the aggregate (-s1,s2) vector * is short enough. 
*/ return PQCLEAN_FALCON1024_CLEAN_is_short((int16_t *)tt, s2, logn); @@ -699,7 +699,7 @@ PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h, return 1; } -/* see internal.h */ +/* see inner.h */ int PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G, const int8_t *f, const int8_t *g, const int8_t *F, @@ -743,3 +743,110 @@ PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G, } return 1; } + +/* see inner.h */ +int +PQCLEAN_FALCON1024_CLEAN_is_invertible( + const int16_t *s2, unsigned logn, uint8_t *tmp) { + size_t u, n; + uint16_t *tt; + uint32_t r; + + n = (size_t)1 << logn; + tt = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u]; + w += Q & -(w >> 31); + tt[u] = (uint16_t)w; + } + mq_NTT(tt, logn); + r = 0; + for (u = 0; u < n; u ++) { + r |= (uint32_t)(tt[u] - 1); + } + return (int)(1u - (r >> 31)); +} + +/* see inner.h */ +int +PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h, + const uint16_t *c0, const int16_t *s1, const int16_t *s2, + unsigned logn, uint8_t *tmp) { + size_t u, n; + uint16_t *tt; + uint32_t r; + + n = (size_t)1 << logn; + + /* + * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] + * and c0 - s1 into h[]. + */ + tt = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u]; + w += Q & -(w >> 31); + tt[u] = (uint16_t)w; + + w = (uint32_t)s1[u]; + w += Q & -(w >> 31); + w = mq_sub(c0[u], w); + h[u] = (uint16_t)w; + } + + /* + * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 + * is zero (in NTT representation) then the operation fails. We + * keep that information into a flag so that we do not deviate + * from strict constant-time processing; if all coefficients of + * s2 are non-zero, then the high bit of r will be zero. 
+ */ + mq_NTT(tt, logn); + mq_NTT(h, logn); + r = 0; + for (u = 0; u < n; u ++) { + r |= (uint32_t)(tt[u] - 1); + h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); + } + mq_iNTT(h, logn); + + /* + * Signature is acceptable if and only if it is short enough, + * and s2 was invertible mod phi mod q. The caller must still + * check that the rebuilt public key matches the expected + * value (e.g. through a hash). + */ + r = ~r & (uint32_t) - PQCLEAN_FALCON1024_CLEAN_is_short(s1, s2, logn); + return (int)(r >> 31); +} + +/* see inner.h */ +int +PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) { + uint16_t *s2; + size_t u, n; + uint32_t r; + + n = (size_t)1 << logn; + s2 = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)sig[u]; + w += Q & -(w >> 31); + s2[u] = (uint16_t)w; + } + mq_NTT(s2, logn); + r = 0; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u] - 1u; + r += (w >> 31); + } + return (int)r; +} diff --git a/crypto_sign/falcon-512/META.yml b/crypto_sign/falcon-512/META.yml index e12b3db9..e66ba9be 100644 --- a/crypto_sign/falcon-512/META.yml +++ b/crypto_sign/falcon-512/META.yml @@ -4,8 +4,8 @@ claimed-nist-level: 1 length-public-key: 897 length-secret-key: 1281 length-signature: 690 -nistkat-sha256: abc62e7be3d7c1db757ba3cbb771cfdc89c6b36fb5efc885593db89ec2ea8bc4 -testvectors-sha256: 1a1b170fc9e4623e7ff519c15ec7a2dda55e94a175756b7c72429451bd226b09 +nistkat-sha256: e9c3985f1ce732e29ca81aeca091f20d4dbb5beb456ee1a7ab41d04add4dab10 +testvectors-sha256: 036b5e803ab825146502513b7460b24cc9493f8e366323cd5e30e2dc6d4ca6a7 principal-submitters: - Thomas Prest auxiliary-submitters: diff --git a/crypto_sign/falcon-512/clean/common.c b/crypto_sign/falcon-512/clean/common.c index e46a4eb5..dcea0c1a 100644 --- a/crypto_sign/falcon-512/clean/common.c +++ b/crypto_sign/falcon-512/clean/common.c @@ -33,10 +33,43 @@ /* see inner.h */ void -PQCLEAN_FALCON512_CLEAN_hash_to_point( - shake256_context *sc, - 
uint16_t *x, unsigned logn, uint8_t *tmp) { +PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime( + inner_shake256_context *sc, + uint16_t *x, unsigned logn) { + /* + * This is the straightforward per-the-spec implementation. It + * is not constant-time, thus it might reveal information on the + * plaintext (at least, enough to check the plaintext against a + * list of potential plaintexts) in a scenario where the + * attacker does not have access to the signature value or to + * the public key, but knows the nonce (without knowledge of the + * nonce, the hashed output cannot be matched against potential + * plaintexts). + */ + size_t n; + n = (size_t)1 << logn; + while (n > 0) { + uint8_t buf[2]; + uint32_t w; + + inner_shake256_extract(sc, (void *)buf, sizeof buf); + w = ((unsigned)buf[0] << 8) | (unsigned)buf[1]; + if (w < 61445) { + while (w >= 12289) { + w -= 12289; + } + *x ++ = (uint16_t)w; + n --; + } + } +} + +/* see inner.h */ +void +PQCLEAN_FALCON512_CLEAN_hash_to_point_ct( + inner_shake256_context *sc, + uint16_t *x, unsigned logn, uint8_t *tmp) { /* * Each 16-bit sample is a value in 0..65535. 
The value is * kept if it falls in 0..61444 (because 61445 = 5*12289) @@ -97,7 +130,7 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point( uint8_t buf[2]; uint32_t w, wr; - shake256_extract(sc, buf, sizeof buf); + inner_shake256_extract(sc, buf, sizeof buf); w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1]; wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1)); wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1)); @@ -196,7 +229,6 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point( *d = (uint16_t)(dv ^ (mk & (sv ^ dv))); } } - } /* see inner.h */ diff --git a/crypto_sign/falcon-512/clean/fpr.c b/crypto_sign/falcon-512/clean/fpr.c index b9a8999d..636b4092 100644 --- a/crypto_sign/falcon-512/clean/fpr.c +++ b/crypto_sign/falcon-512/clean/fpr.c @@ -507,7 +507,7 @@ fpr_sqrt(fpr x) { uint64_t -fpr_expm_p63(fpr x) { +fpr_expm_p63(fpr x, fpr ccs) { /* * Polynomial approximation of exp(-x) is taken from FACCT: * https://eprint.iacr.org/2018/1234 @@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) { uint64_t z, y; unsigned u; + uint32_t z0, z1, y0, y1; + uint64_t a, b; y = C[0]; z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1; @@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) { * also have appropriate IEEE754 floating-point support, * which is better. */ - uint32_t z0, z1, y0, y1; - uint64_t a, b, c; + uint64_t c; z0 = (uint32_t)z; z1 = (uint32_t)(z >> 32); @@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) { c += (uint64_t)z1 * (uint64_t)y1; y = C[u] - c; } + + /* + * The scaling factor must be applied at the end. Since y is now + * in fixed-point notation, we have to convert the factor to the + * same format, and do an extra integer multiplication. 
+ */ + z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1; + z0 = (uint32_t)z; + z1 = (uint32_t)(z >> 32); + y0 = (uint32_t)y; + y1 = (uint32_t)(y >> 32); + a = ((uint64_t)z0 * (uint64_t)y1) + + (((uint64_t)z0 * (uint64_t)y0) >> 32); + b = ((uint64_t)z1 * (uint64_t)y0); + y = (a >> 32) + (b >> 32); + y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32); + y += (uint64_t)z1 * (uint64_t)y1; + return y; } diff --git a/crypto_sign/falcon-512/clean/fpr.h b/crypto_sign/falcon-512/clean/fpr.h index ef7275a1..f29e55f3 100644 --- a/crypto_sign/falcon-512/clean/fpr.h +++ b/crypto_sign/falcon-512/clean/fpr.h @@ -232,6 +232,8 @@ static const fpr fpr_zero = 0; static const fpr fpr_one = 4607182418800017408; static const fpr fpr_two = 4611686018427387904; static const fpr fpr_onehalf = 4602678819172646912; +static const fpr fpr_invsqrt2 = 4604544271217802189; +static const fpr fpr_invsqrt8 = 4600040671590431693; static const fpr fpr_ptwo31 = 4746794007248502784; static const fpr fpr_ptwo31m1 = 4746794007244308480; static const fpr fpr_mtwo31m1 = 13970166044099084288U; @@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) { * bits or so. */ #define fpr_expm_p63 PQCLEAN_FALCON512_CLEAN_fpr_expm_p63 -uint64_t fpr_expm_p63(fpr x); +uint64_t fpr_expm_p63(fpr x, fpr ccs); #define fpr_gm_tab PQCLEAN_FALCON512_CLEAN_fpr_gm_tab extern const fpr fpr_gm_tab[]; diff --git a/crypto_sign/falcon-512/clean/inner.h b/crypto_sign/falcon-512/clean/inner.h index 4861df6e..78c74bb8 100644 --- a/crypto_sign/falcon-512/clean/inner.h +++ b/crypto_sign/falcon-512/clean/inner.h @@ -34,6 +34,45 @@ * @author Thomas Pornin */ +/* + * IMPORTANT API RULES + * ------------------- + * + * This API has some non-trivial usage rules: + * + * + * - All public functions (i.e. the non-static ones) must be referenced + * with the PQCLEAN_FALCON512_CLEAN_ macro (e.g. PQCLEAN_FALCON512_CLEAN_verify_raw for the verify_raw() + * function). 
That macro adds a prefix to the name, which is + * configurable with the FALCON_PREFIX macro. This allows compiling + * the code into a specific "namespace" and potentially including + * several versions of this code into a single application (e.g. to + * have an AVX2 and a non-AVX2 variants and select the one to use at + * runtime based on availability of AVX2 opcodes). + * + * - Functions that need temporary buffers expects them as a final + * tmp[] array of type uint8_t*, with a size which is documented for + * each function. However, most have some alignment requirements, + * because they will use the array to store 16-bit, 32-bit or 64-bit + * values (e.g. uint64_t or double). The caller must ensure proper + * alignment. What happens on unaligned access depends on the + * underlying architecture, ranging from a slight time penalty + * to immediate termination of the process. + * + * - Some functions rely on specific rounding rules and precision for + * floating-point numbers. On some systems (in particular 32-bit x86 + * with the 387 FPU), this requires setting an hardware control + * word. The caller MUST use set_fpu_cw() to ensure proper precision: + * + * oldcw = set_fpu_cw(2); + * PQCLEAN_FALCON512_CLEAN_sign_dyn(...); + * set_fpu_cw(oldcw); + * + * On systems where the native floating-point precision is already + * proper, or integer-based emulation is used, the set_fpu_cw() + * function does nothing, so it can be called systematically. + */ + #include #include @@ -42,22 +81,47 @@ + +/* + * Some computations with floating-point elements, in particular + * rounding to the nearest integer, rely on operations using _exactly_ + * the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit + * x86, the 387 FPU may be used (depending on the target OS) and, in + * that case, may use more precision bits (i.e. 
64 bits, for an 80-bit + * total type length); to prevent miscomputations, we define an explicit + * function that modifies the precision in the FPU control word. + * + * set_fpu_cw() sets the precision to the provided value, and returns + * the previously set precision; callers are supposed to restore the + * previous precision on exit. The correct (52-bit) precision is + * configured with the value "2". On unsupported compilers, or on + * targets other than 32-bit x86, or when the native 'double' type is + * not used, the set_fpu_cw() function does nothing at all. + */ +static inline unsigned +set_fpu_cw(unsigned x) { + return x; +} + + + + /* ==================================================================== */ /* * SHAKE256 implementation (shake.c). * * API is defined to be easily replaced with the fips202.h API defined - * as part of PQ Clean. + * as part of PQClean. */ #include "fips202.h" -#define shake256_context shake256incctx -#define shake256_init(sc) shake256_inc_init(sc) -#define shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) -#define shake256_flip(sc) shake256_inc_finalize(sc) -#define shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) +#define inner_shake256_context shake256incctx +#define inner_shake256_init(sc) shake256_inc_init(sc) +#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len) +#define inner_shake256_flip(sc) shake256_inc_finalize(sc) +#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc) /* ==================================================================== */ @@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON512_CLEAN_max_sig_bits[]; /* * From a SHAKE256 context (must be already flipped), produce a new - * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. + * point. 
This is the non-constant-time version, which may leak enough + * information to serve as a stop condition on a brute force attack on + * the hashed message (provided that the nonce value is known). */ -void PQCLEAN_FALCON512_CLEAN_hash_to_point(shake256_context *sc, +void PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc, + uint16_t *x, unsigned logn); + +/* + * From a SHAKE256 context (must be already flipped), produce a new + * point. The temporary buffer (tmp) must have room for 2*2^logn bytes. + * This function is constant-time but is typically more expensive than + * PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(). + * + * tmp[] must have 16-bit alignment. + */ +void PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(inner_shake256_context *sc, uint16_t *x, unsigned logn, uint8_t *tmp); /* @@ -184,6 +261,8 @@ void PQCLEAN_FALCON512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn); * logn is the degree log * tmp[] temporary, must have at least 2*2^logn bytes * Returned value is 1 on success, 0 on error. + * + * tmp[] must have 16-bit alignment. */ int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, const uint16_t *h, unsigned logn, uint8_t *tmp); @@ -195,6 +274,7 @@ int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, * reported if f is not invertible mod phi mod q). * * The tmp[] array must have room for at least 2*2^logn elements. + * tmp[] must have 16-bit alignment. */ int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h, const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp); @@ -208,11 +288,53 @@ int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h, * The tmp[] array must have room for at least 4*2^logn bytes. * * Returned value is 1 in success, 0 on error (f not invertible). + * tmp[] must have 16-bit alignment. 
*/ int PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G, const int8_t *f, const int8_t *g, const int8_t *F, unsigned logn, uint8_t *tmp); +/* + * Test whether a given polynomial is invertible modulo phi and q. + * Polynomial coefficients are small integers. + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON512_CLEAN_is_invertible( + const int16_t *s2, unsigned logn, uint8_t *tmp); + +/* + * Count the number of elements of value zero in the NTT representation + * of the given polynomial: this is the number of primitive 2n-th roots + * of unity (modulo q = 12289) that are roots of the provided polynomial + * (taken modulo q). + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp); + +/* + * Internal signature verification with public key recovery: + * h[] receives the public key (NOT in NTT/Montgomery format) + * c0[] contains the hashed nonce+message + * s1[] is the first signature half + * s2[] is the second signature half + * logn is the degree log + * tmp[] temporary, must have at least 2*2^logn bytes + * Returned value is 1 on success, 0 on error. Success is returned if + * the signature is a short enough vector; in that case, the public + * key has been written to h[]. However, the caller must still + * verify that h[] is the correct value (e.g. with regards to a known + * hash of the public key). + * + * h[] may not overlap with any of the other arrays. + * + * tmp[] must have 16-bit alignment. + */ +int PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h, + const uint16_t *c0, const int16_t *s1, const int16_t *s2, + unsigned logn, uint8_t *tmp); + /* ==================================================================== */ /* * Implementation of floating-point real numbers (fpr.h, fpr.c). @@ -358,7 +480,7 @@ typedef struct { * Instantiate a PRNG. That PRNG will feed over the provided SHAKE256 * context (in "flipped" state) to obtain its initial state. 
*/ -void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src); +void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src); /* * Refill the PRNG buffer. This is normally invoked automatically, and @@ -586,6 +708,9 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f, /* * Required sizes of the temporary buffer (in bytes). + * + * This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1 + * or 2) where it is slightly greater. */ #define FALCON_KEYGEN_TEMP_1 136 #define FALCON_KEYGEN_TEMP_2 272 @@ -608,8 +733,11 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f, * public key is written in h. Either or both of G and h may be NULL, * in which case the corresponding element is not returned (they can * be recomputed from f, g and F). + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ -void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng, +void PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng, int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, unsigned logn, uint8_t *tmp); @@ -624,6 +752,9 @@ void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng, * a total of (8*logn+40)*2^logn bytes. * * The tmp[] array must have room for at least 48*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, @@ -636,9 +767,15 @@ void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key, * * The sig[] and hm[] buffers may overlap. * + * On successful output, the start of the tmp[] buffer contains the s1 + * vector (as int16_t elements). + * * The minimal size (in bytes) of tmp[] is 48*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). 
*/ -void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, +void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng, const fpr *expanded_key, const uint16_t *hm, unsigned logn, uint8_t *tmp); @@ -651,13 +788,47 @@ void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, * * The sig[] and hm[] buffers may overlap. * + * On successful output, the start of the tmp[] buffer contains the s1 + * vector (as int16_t elements). + * * The minimal size (in bytes) of tmp[] is 72*2^logn bytes. + * + * tmp[] must have 64-bit alignment. + * This function uses floating-point rounding (see set_fpu_cw()). */ -void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, +void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, const uint16_t *hm, unsigned logn, uint8_t *tmp); +/* + * Internal sampler engine. Exported for tests. + * + * sampler_context wraps around a source of random numbers (PRNG) and + * the sigma_min value (nominally dependent on the degree). + * + * sampler() takes as parameters: + * ctx pointer to the sampler_context structure + * mu center for the distribution + * isigma inverse of the distribution standard deviation + * It returns an integer sampled along the Gaussian distribution centered + * on mu and of standard deviation sigma = 1/isigma. + * + * gaussian0_sampler() takes as parameter a pointer to a PRNG, and + * returns an integer sampled along a half-Gaussian with standard + * deviation sigma0 = 1.8205 (center is 0, returned value is + * nonnegative). 
+ */ + +typedef struct { + prng p; + fpr sigma_min; +} sampler_context; + +int PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma); + +int PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p); + /* ==================================================================== */ #endif diff --git a/crypto_sign/falcon-512/clean/keygen.c b/crypto_sign/falcon-512/clean/keygen.c index 691165ae..b8f0dac1 100644 --- a/crypto_sign/falcon-512/clean/keygen.c +++ b/crypto_sign/falcon-512/clean/keygen.c @@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride, /* ==================================================================== */ + +#define RNG_CONTEXT inner_shake256_context + /* * Get a random 8-byte integer from a SHAKE-based RNG. This function * ensures consistent interpretation of the SHAKE output so that @@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride, * a known seed is used. */ static inline uint64_t -get_rng_u64(shake256_context *rng) { +get_rng_u64(inner_shake256_context *rng) { /* * We enforce little-endian representation. */ uint8_t tmp[8]; - shake256_extract(rng, tmp, sizeof tmp); + inner_shake256_extract(rng, tmp, sizeof tmp); return (uint64_t)tmp[0] | ((uint64_t)tmp[1] << 8) | ((uint64_t)tmp[2] << 16) @@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) { | ((uint64_t)tmp[7] << 56); } + /* * Table below incarnates a discrete Gaussian distribution: * D(x) = exp(-(x^2)/(2*sigma^2)) @@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = { * together for lower dimensions. 
*/ static int -mkgauss(shake256_context *rng, unsigned logn) { +mkgauss(RNG_CONTEXT *rng, unsigned logn) { unsigned u, g; int val; @@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top, fpr xv; xv = fpr_mul(rt2[u], pdc); + /* * Sometimes the values can be out-of-bounds if * the algorithm fails; we must not call @@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G, * also makes sure that the resultant of the polynomial with phi is odd. */ static void -poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) { +poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) { size_t n, u; unsigned mod2; @@ -4046,7 +4051,7 @@ restart: /* see falcon.h */ void -PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng, +PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng, int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h, unsigned logn, uint8_t *tmp) { /* @@ -4070,8 +4075,10 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng, */ size_t n, u; uint16_t *h2, *tmp2; + RNG_CONTEXT *rc; n = MKN(logn); + rc = rng; /* * We need to generate f and g randomly, until we find values @@ -4104,8 +4111,8 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng, * (i.e. the resultant of the polynomial with phi * will be odd). */ - poly_small_mkgauss(rng, f, logn); - poly_small_mkgauss(rng, g, logn); + poly_small_mkgauss(rc, f, logn); + poly_small_mkgauss(rc, g, logn); /* * Verify that all coefficients are within the bounds diff --git a/crypto_sign/falcon-512/clean/pqclean.c b/crypto_sign/falcon-512/clean/pqclean.c index 6e5ddd99..c31599b5 100644 --- a/crypto_sign/falcon-512/clean/pqclean.c +++ b/crypto_sign/falcon-512/clean/pqclean.c @@ -51,16 +51,16 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair( int8_t f[512], g[512], F[512]; uint16_t h[512]; unsigned char seed[48]; - shake256_context rng; + inner_shake256_context rng; size_t u, v; /* * Generate key pair. 
*/ randombytes(seed, sizeof seed); - shake256_init(&rng); - shake256_inject(&rng, seed, sizeof seed); - shake256_flip(&rng); + inner_shake256_init(&rng); + inner_shake256_inject(&rng, seed, sizeof seed); + inner_shake256_flip(&rng); PQCLEAN_FALCON512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b); /* @@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, uint16_t hm[512]; } r; unsigned char seed[48]; - shake256_context sc; + inner_shake256_context sc; size_t u, v; /* @@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen, /* * Hash message nonce + message into a vector. */ - shake256_init(&sc); - shake256_inject(&sc, nonce, NONCELEN); - shake256_inject(&sc, m, mlen); - shake256_flip(&sc); - PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, r.hm, 9, tmp.b); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, nonce, NONCELEN); + inner_shake256_inject(&sc, m, mlen); + inner_shake256_flip(&sc); + PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b); /* * Initialize a RNG. */ randombytes(seed, sizeof seed); - shake256_init(&sc); - shake256_inject(&sc, seed, sizeof seed); - shake256_flip(&sc); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, seed, sizeof seed); + inner_shake256_flip(&sc); /* * Compute and return the signature. This loops until a signature @@ -225,7 +225,7 @@ do_verify( } tmp; uint16_t h[512], hm[512]; int16_t sig[512]; - shake256_context sc; + inner_shake256_context sc; /* * Decode public key. @@ -253,11 +253,11 @@ do_verify( /* * Hash nonce + message into a vector. */ - shake256_init(&sc); - shake256_inject(&sc, nonce, NONCELEN); - shake256_inject(&sc, m, mlen); - shake256_flip(&sc); - PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, hm, 9, tmp.b); + inner_shake256_init(&sc); + inner_shake256_inject(&sc, nonce, NONCELEN); + inner_shake256_inject(&sc, m, mlen); + inner_shake256_flip(&sc); + PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b); /* * Verify signature. 
diff --git a/crypto_sign/falcon-512/clean/rng.c b/crypto_sign/falcon-512/clean/rng.c index f09bec93..93859344 100644 --- a/crypto_sign/falcon-512/clean/rng.c +++ b/crypto_sign/falcon-512/clean/rng.c @@ -36,7 +36,7 @@ /* see inner.h */ void -PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) { +PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) { /* * To ensure reproducibility for a given seed, we * must enforce little-endian interpretation of @@ -46,7 +46,7 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) { uint64_t th, tl; int i; - shake256_extract(src, tmp, 56); + inner_shake256_extract(src, tmp, 56); for (i = 0; i < 14; i ++) { uint32_t w; diff --git a/crypto_sign/falcon-512/clean/sign.c b/crypto_sign/falcon-512/clean/sign.c index 9fd0fc78..d53fda29 100644 --- a/crypto_sign/falcon-512/clean/sign.c +++ b/crypto_sign/falcon-512/clean/sign.c @@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, size_t n, hn; const fpr *tree0, *tree1; - n = (size_t)1 << logn; - if (n == 1) { + /* + * When logn == 2, we inline the last two recursion levels. + */ + if (logn == 2) { + fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma; + fpr a_re, a_im, b_re, b_im, c_re, c_im; + + tree0 = tree + 4; + tree1 = tree + 8; + + /* + * We split t1 into w*, then do the recursive invocation, + * with output in w*. We finally merge back into z1. 
+ */ + a_re = t1[0]; + a_im = t1[2]; + b_re = t1[1]; + b_im = t1[3]; + c_re = fpr_add(a_re, b_re); + c_im = fpr_add(a_im, b_im); + w0 = fpr_half(c_re); + w1 = fpr_half(c_im); + c_re = fpr_sub(a_re, b_re); + c_im = fpr_sub(a_im, b_im); + w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); + w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); + + x0 = w2; + x1 = w3; + sigma = tree1[3]; + w2 = fpr_of(samp(samp_ctx, x0, sigma)); + w3 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, w2); + a_im = fpr_sub(x1, w3); + b_re = tree1[0]; + b_im = tree1[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, w0); + x1 = fpr_add(c_im, w1); + sigma = tree1[2]; + w0 = fpr_of(samp(samp_ctx, x0, sigma)); + w1 = fpr_of(samp(samp_ctx, x1, sigma)); + + a_re = w0; + a_im = w1; + b_re = w2; + b_im = w3; + c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); + c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); + z1[0] = w0 = fpr_add(a_re, c_re); + z1[2] = w2 = fpr_add(a_im, c_im); + z1[1] = w1 = fpr_sub(a_re, c_re); + z1[3] = w3 = fpr_sub(a_im, c_im); + + /* + * Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*. + */ + w0 = fpr_sub(t1[0], w0); + w1 = fpr_sub(t1[1], w1); + w2 = fpr_sub(t1[2], w2); + w3 = fpr_sub(t1[3], w3); + + a_re = w0; + a_im = w2; + b_re = tree[0]; + b_im = tree[2]; + w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + a_re = w1; + a_im = w3; + b_re = tree[1]; + b_im = tree[3]; + w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + + w0 = fpr_add(w0, t0[0]); + w1 = fpr_add(w1, t0[1]); + w2 = fpr_add(w2, t0[2]); + w3 = fpr_add(w3, t0[3]); + + /* + * Second recursive invocation. 
+ */ + a_re = w0; + a_im = w2; + b_re = w1; + b_im = w3; + c_re = fpr_add(a_re, b_re); + c_im = fpr_add(a_im, b_im); + w0 = fpr_half(c_re); + w1 = fpr_half(c_im); + c_re = fpr_sub(a_re, b_re); + c_im = fpr_sub(a_im, b_im); + w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8); + w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8); + + x0 = w2; + x1 = w3; + sigma = tree0[3]; + w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma)); + w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, y0); + a_im = fpr_sub(x1, y1); + b_re = tree0[0]; + b_im = tree0[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, w0); + x1 = fpr_add(c_im, w1); + sigma = tree0[2]; + w0 = fpr_of(samp(samp_ctx, x0, sigma)); + w1 = fpr_of(samp(samp_ctx, x1, sigma)); + + a_re = w0; + a_im = w1; + b_re = w2; + b_im = w3; + c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2); + c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2); + z0[0] = fpr_add(a_re, c_re); + z0[2] = fpr_add(a_im, c_im); + z0[1] = fpr_sub(a_re, c_re); + z0[3] = fpr_sub(a_im, c_im); + + return; + } + + /* + * Case logn == 1 is reachable only when using Falcon-2 (the + * smallest size for which Falcon is mathematically defined, but + * of course way too insecure to be of any use). 
+ */ + if (logn == 1) { + fpr x0, x1, y0, y1, sigma; + fpr a_re, a_im, b_re, b_im, c_re, c_im; + + x0 = t1[0]; + x1 = t1[1]; + sigma = tree[3]; + z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma)); + z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma)); + a_re = fpr_sub(x0, y0); + a_im = fpr_sub(x1, y1); + b_re = tree[0]; + b_im = tree[1]; + c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im)); + c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re)); + x0 = fpr_add(c_re, t0[0]); + x1 = fpr_add(c_im, t0[1]); + sigma = tree[2]; + z0[0] = fpr_of(samp(samp_ctx, x0, sigma)); + z0[1] = fpr_of(samp(samp_ctx, x1, sigma)); + + return; + } + + /* + * Normal end of recursion is for logn == 0. Since the last + * steps of the recursions were inlined in the blocks above + * (when logn == 1 or 2), this case is not reachable, and is + * retained here only for documentation purposes. + + if (logn == 0) { fpr x0, x1, sigma; x0 = t0[0]; @@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx, return; } + */ + + /* + * General recursive case (logn >= 3). + */ + + n = (size_t)1 << logn; hn = n >> 1; tree0 = tree + n; tree1 = tree + n + ffLDL_treesize(logn - 1); @@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, const fpr *b00, *b01, *b10, *b11, *tree; fpr ni; uint32_t sqn, ng; - int16_t *s2tmp; + int16_t *s1tmp, *s2tmp; n = MKN(logn); t0 = tmp; @@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, /* * Compute the signature. 
*/ + s1tmp = (int16_t *)tx; sqn = 0; ng = 0; for (u = 0; u < n; u ++) { @@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); sqn += (uint32_t)(z * z); ng |= sqn; + s1tmp[u] = (int16_t)z; } sqn |= -(ng >> 31); @@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2, } if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) { memcpy(s2, s2tmp, n * sizeof * s2); + memcpy(tmp, s1tmp, n * sizeof * s1tmp); return 1; } return 0; @@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11; fpr ni; uint32_t sqn, ng; - int16_t *s2tmp; + int16_t *s1tmp, *s2tmp; n = MKN(logn); @@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, PQCLEAN_FALCON512_CLEAN_iFFT(t0, logn); PQCLEAN_FALCON512_CLEAN_iFFT(t1, logn); + s1tmp = (int16_t *)tx; sqn = 0; ng = 0; for (u = 0; u < n; u ++) { @@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]); sqn += (uint32_t)(z * z); ng |= sqn; + s1tmp[u] = (int16_t)z; } sqn |= -(ng >> 31); @@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, } if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) { memcpy(s2, s2tmp, n * sizeof * s2); + memcpy(tmp, s1tmp, n * sizeof * s1tmp); return 1; } return 0; @@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2, * Sample an integer value along a half-gaussian distribution centered * on zero and standard deviation 1.8205, with a precision of 72 bits. 
*/ -static int -gaussian0_sampler(prng *p) { +int +PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p) { static const uint32_t dist[] = { - 6031371U, 13708371U, 13035518U, - 5186761U, 1487980U, 12270720U, - 3298653U, 4688887U, 5511555U, - 1551448U, 9247616U, 9467675U, - 539632U, 14076116U, 5909365U, - 138809U, 10836485U, 13263376U, - 26405U, 15335617U, 16601723U, - 3714U, 14514117U, 13240074U, - 386U, 8324059U, 3276722U, - 29U, 12376792U, 7821247U, - 1U, 11611789U, 3398254U, - 0U, 1194629U, 4532444U, - 0U, 37177U, 2973575U, - 0U, 855U, 10369757U, - 0U, 14U, 9441597U, - 0U, 0U, 3075302U, - 0U, 0U, 28626U, - 0U, 0U, 197U, - 0U, 0U, 1U + 10745844u, 3068844u, 3741698u, + 5559083u, 1580863u, 8248194u, + 2260429u, 13669192u, 2736639u, + 708981u, 4421575u, 10046180u, + 169348u, 7122675u, 4136815u, + 30538u, 13063405u, 7650655u, + 4132u, 14505003u, 7826148u, + 417u, 16768101u, 11363290u, + 31u, 8444042u, 8086568u, + 1u, 12844466u, 265321u, + 0u, 1232676u, 13644283u, + 0u, 38047u, 9111839u, + 0u, 870u, 6138264u, + 0u, 14u, 12545723u, + 0u, 0u, 3104126u, + 0u, 0u, 28824u, + 0u, 0u, 198u, + 0u, 0u, 1u }; uint32_t v0, v1, v2, hi; @@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) { * Sample a bit with probability exp(-x) for some x >= 0. */ static int -BerExp(prng *p, fpr x) { +BerExp(prng *p, fpr x, fpr ccs) { int s, i; fpr r; uint32_t sw, w; @@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) { * case). The bias is negligible since fpr_expm_p63() only computes * with 51 bits of precision or so. */ - z = ((fpr_expm_p63(r) << 1) - 1) >> s; + z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s; /* * Sample a bit with probability exp(-x). Since x = s*log(2) + r, @@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) { return (int)(w >> 31); } -typedef struct { - prng p; - fpr sigma_min; -} sampler_context; - /* * The sampler produces a random integer that follows a discrete Gaussian * distribution, centered on mu, and with standard deviation sigma. 
The @@ -909,8 +1078,8 @@ typedef struct { * The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between * 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9. */ -static int -sampler(void *ctx, fpr mu, fpr isigma) { +int +PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) { sampler_context *spc; int s; fpr r, dss, ccs; @@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { * - b = 0: z <= 0 and sampled against a Gaussian * centered on 0. */ - z0 = gaussian0_sampler(&spc->p); + z0 = PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(&spc->p); b = prng_get_u8(&spc->p) & 1; z = b + ((b << 1) - 1) * z0; @@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { */ x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss); x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0)); - x = fpr_mul(x, ccs); - if (BerExp(&spc->p, x)) { + if (BerExp(&spc->p, x, ccs)) { /* * Rejection sampling was centered on r, but the * actual center is mu = s + r. @@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) { /* see inner.h */ void -PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, +PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng, const fpr *expanded_key, const uint16_t *hm, unsigned logn, uint8_t *tmp) { fpr *ftmp; @@ -1025,7 +1193,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, ? 
fpr_sigma_min_10 : fpr_sigma_min_9; PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng); - samp = sampler; + samp = PQCLEAN_FALCON512_CLEAN_sampler; samp_ctx = &spc; /* @@ -1040,7 +1208,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng, /* see inner.h */ void -PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, +PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng, const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G, const uint16_t *hm, unsigned logn, uint8_t *tmp) { @@ -1070,7 +1238,7 @@ PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng, ? fpr_sigma_min_10 : fpr_sigma_min_9; PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng); - samp = sampler; + samp = PQCLEAN_FALCON512_CLEAN_sampler; samp_ctx = &spc; /* diff --git a/crypto_sign/falcon-512/clean/vrfy.c b/crypto_sign/falcon-512/clean/vrfy.c index 839e80ea..779bd2c8 100644 --- a/crypto_sign/falcon-512/clean/vrfy.c +++ b/crypto_sign/falcon-512/clean/vrfy.c @@ -649,7 +649,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, } /* - * Compute s1 = s2*h - c0 mod phi mod q (in tt[]). + * Compute -s1 = s2*h - c0 mod phi mod q (in tt[]). */ mq_NTT(tt, logn); mq_poly_montymul_ntt(tt, h, logn); @@ -657,7 +657,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, mq_poly_sub(tt, c0, logn); /* - * Normalize s1 elements into the [-q/2..q/2] range. + * Normalize -s1 elements into the [-q/2..q/2] range. */ for (u = 0; u < n; u ++) { int32_t w; @@ -668,7 +668,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2, } /* - * Signature is valid if and only if the aggregate (s1,s2) vector + * Signature is valid if and only if the aggregate (-s1,s2) vector * is short enough. 
*/ return PQCLEAN_FALCON512_CLEAN_is_short((int16_t *)tt, s2, logn); @@ -699,7 +699,7 @@ PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h, return 1; } -/* see internal.h */ +/* see inner.h */ int PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G, const int8_t *f, const int8_t *g, const int8_t *F, @@ -743,3 +743,110 @@ PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G, } return 1; } + +/* see inner.h */ +int +PQCLEAN_FALCON512_CLEAN_is_invertible( + const int16_t *s2, unsigned logn, uint8_t *tmp) { + size_t u, n; + uint16_t *tt; + uint32_t r; + + n = (size_t)1 << logn; + tt = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u]; + w += Q & -(w >> 31); + tt[u] = (uint16_t)w; + } + mq_NTT(tt, logn); + r = 0; + for (u = 0; u < n; u ++) { + r |= (uint32_t)(tt[u] - 1); + } + return (int)(1u - (r >> 31)); +} + +/* see inner.h */ +int +PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h, + const uint16_t *c0, const int16_t *s1, const int16_t *s2, + unsigned logn, uint8_t *tmp) { + size_t u, n; + uint16_t *tt; + uint32_t r; + + n = (size_t)1 << logn; + + /* + * Reduce elements of s1 and s2 modulo q; then write s2 into tt[] + * and c0 - s1 into h[]. + */ + tt = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u]; + w += Q & -(w >> 31); + tt[u] = (uint16_t)w; + + w = (uint32_t)s1[u]; + w += Q & -(w >> 31); + w = mq_sub(c0[u], w); + h[u] = (uint16_t)w; + } + + /* + * Compute h = (c0 - s1) / s2. If one of the coefficients of s2 + * is zero (in NTT representation) then the operation fails. We + * keep that information into a flag so that we do not deviate + * from strict constant-time processing; if all coefficients of + * s2 are non-zero, then the high bit of r will be zero. 
+ */ + mq_NTT(tt, logn); + mq_NTT(h, logn); + r = 0; + for (u = 0; u < n; u ++) { + r |= (uint32_t)(tt[u] - 1); + h[u] = (uint16_t)mq_div_12289(h[u], tt[u]); + } + mq_iNTT(h, logn); + + /* + * Signature is acceptable if and only if it is short enough, + * and s2 was invertible mod phi mod q. The caller must still + * check that the rebuilt public key matches the expected + * value (e.g. through a hash). + */ + r = ~r & (uint32_t) - PQCLEAN_FALCON512_CLEAN_is_short(s1, s2, logn); + return (int)(r >> 31); +} + +/* see inner.h */ +int +PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) { + uint16_t *s2; + size_t u, n; + uint32_t r; + + n = (size_t)1 << logn; + s2 = (uint16_t *)tmp; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)sig[u]; + w += Q & -(w >> 31); + s2[u] = (uint16_t)w; + } + mq_NTT(s2, logn); + r = 0; + for (u = 0; u < n; u ++) { + uint32_t w; + + w = (uint32_t)s2[u] - 1u; + r += (w >> 31); + } + return (int)r; +}