1
1
mirror of https://github.com/henrydcase/pqc.git synced 2024-11-22 15:39:07 +00:00

Fixed sampler bug (update to new upstream Falcon code 2019-09-18).

This commit is contained in:
Thomas Pornin 2019-09-19 15:55:28 -04:00
parent 4be5e497dc
commit 44a050106a
20 changed files with 1204 additions and 192 deletions

View File

@ -4,8 +4,8 @@ claimed-nist-level: 5
length-public-key: 1793
length-secret-key: 2305
length-signature: 1330
nistkat-sha256: ad3d17869fdc05deae13ffa2ef26bde125b42f61b2dcd861a1ae20adcb2accc5
testvectors-sha256: bd8076c13722d8c555c68fc6bd7763e1a9dd5483ee7c8d1c74dd2df459c72a40
nistkat-sha256: ef2104e326221515621638ca03cd99802271bdd9907e2ae5fc7b8d19d696c584
testvectors-sha256: 14ee0e3f0ea4b9b25193a54eed9100b1bb1cf5dbc7813fd9dc9180c1ea1a1042
principal-submitters:
- Thomas Prest
auxiliary-submitters:

View File

@ -33,10 +33,43 @@
/* see inner.h */
void
PQCLEAN_FALCON1024_CLEAN_hash_to_point(
shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp) {
PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(
inner_shake256_context *sc,
uint16_t *x, unsigned logn) {
/*
* This is the straightforward per-the-spec implementation. It
* is not constant-time, thus it might reveal information on the
* plaintext (at least, enough to check the plaintext against a
* list of potential plaintexts) in a scenario where the
* attacker does not have access to the signature value or to
* the public key, but knows the nonce (without knowledge of the
* nonce, the hashed output cannot be matched against potential
* plaintexts).
*/
size_t n;
n = (size_t)1 << logn;
while (n > 0) {
uint8_t buf[2];
uint32_t w;
inner_shake256_extract(sc, (void *)buf, sizeof buf);
w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
if (w < 61445) {
while (w >= 12289) {
w -= 12289;
}
*x ++ = (uint16_t)w;
n --;
}
}
}
/* see inner.h */
void
PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(
inner_shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp) {
/*
* Each 16-bit sample is a value in 0..65535. The value is
* kept if it falls in 0..61444 (because 61445 = 5*12289)
@ -97,7 +130,7 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
uint8_t buf[2];
uint32_t w, wr;
shake256_extract(sc, buf, sizeof buf);
inner_shake256_extract(sc, buf, sizeof buf);
w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@ -196,7 +229,6 @@ PQCLEAN_FALCON1024_CLEAN_hash_to_point(
*d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
}
}
}
/* see inner.h */

View File

@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {
uint64_t
fpr_expm_p63(fpr x) {
fpr_expm_p63(fpr x, fpr ccs) {
/*
* Polynomial approximation of exp(-x) is taken from FACCT:
* https://eprint.iacr.org/2018/1234
@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {
uint64_t z, y;
unsigned u;
uint32_t z0, z1, y0, y1;
uint64_t a, b;
y = C[0];
z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
* also have appropriate IEEE754 floating-point support,
* which is better.
*/
uint32_t z0, z1, y0, y1;
uint64_t a, b, c;
uint64_t c;
z0 = (uint32_t)z;
z1 = (uint32_t)(z >> 32);
@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
c += (uint64_t)z1 * (uint64_t)y1;
y = C[u] - c;
}
/*
* The scaling factor must be applied at the end. Since y is now
* in fixed-point notation, we have to convert the factor to the
* same format, and do an extra integer multiplication.
*/
z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
z0 = (uint32_t)z;
z1 = (uint32_t)(z >> 32);
y0 = (uint32_t)y;
y1 = (uint32_t)(y >> 32);
a = ((uint64_t)z0 * (uint64_t)y1)
+ (((uint64_t)z0 * (uint64_t)y0) >> 32);
b = ((uint64_t)z1 * (uint64_t)y0);
y = (a >> 32) + (b >> 32);
y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
y += (uint64_t)z1 * (uint64_t)y1;
return y;
}

View File

@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
static const fpr fpr_one = 4607182418800017408;
static const fpr fpr_two = 4611686018427387904;
static const fpr fpr_onehalf = 4602678819172646912;
static const fpr fpr_invsqrt2 = 4604544271217802189;
static const fpr fpr_invsqrt8 = 4600040671590431693;
static const fpr fpr_ptwo31 = 4746794007248502784;
static const fpr fpr_ptwo31m1 = 4746794007244308480;
static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
* bits or so.
*/
#define fpr_expm_p63 PQCLEAN_FALCON1024_CLEAN_fpr_expm_p63
uint64_t fpr_expm_p63(fpr x);
uint64_t fpr_expm_p63(fpr x, fpr ccs);
#define fpr_gm_tab PQCLEAN_FALCON1024_CLEAN_fpr_gm_tab
extern const fpr fpr_gm_tab[];

View File

@ -34,6 +34,45 @@
* @author Thomas Pornin <thomas.pornin@nccgroup.com>
*/
/*
* IMPORTANT API RULES
* -------------------
*
* This API has some non-trivial usage rules:
*
*
* - All public functions (i.e. the non-static ones) must be referenced
* with the PQCLEAN_FALCON1024_CLEAN_ macro (e.g. PQCLEAN_FALCON1024_CLEAN_verify_raw for the verify_raw()
* function). That macro adds a prefix to the name, which is
* configurable with the FALCON_PREFIX macro. This allows compiling
* the code into a specific "namespace" and potentially including
* several versions of this code into a single application (e.g. to
* have AVX2 and non-AVX2 variants and select the one to use at
* runtime based on availability of AVX2 opcodes).
*
* - Functions that need temporary buffers expect them as a final
* tmp[] array of type uint8_t*, with a size which is documented for
* each function. However, most have some alignment requirements,
* because they will use the array to store 16-bit, 32-bit or 64-bit
* values (e.g. uint64_t or double). The caller must ensure proper
* alignment. What happens on unaligned access depends on the
* underlying architecture, ranging from a slight time penalty
* to immediate termination of the process.
*
* - Some functions rely on specific rounding rules and precision for
* floating-point numbers. On some systems (in particular 32-bit x86
* with the 387 FPU), this requires setting a hardware control
* word. The caller MUST use set_fpu_cw() to ensure proper precision:
*
* oldcw = set_fpu_cw(2);
* PQCLEAN_FALCON1024_CLEAN_sign_dyn(...);
* set_fpu_cw(oldcw);
*
* On systems where the native floating-point precision is already
* proper, or integer-based emulation is used, the set_fpu_cw()
* function does nothing, so it can be called systematically.
*/
#include <stdint.h>
#include <stdlib.h>
@ -42,6 +81,31 @@
/*
* Some computations with floating-point elements, in particular
* rounding to the nearest integer, rely on operations using _exactly_
* the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
* x86, the 387 FPU may be used (depending on the target OS) and, in
* that case, may use more precision bits (i.e. 64 bits, for an 80-bit
* total type length); to prevent miscomputations, we define an explicit
* function that modifies the precision in the FPU control word.
*
* set_fpu_cw() sets the precision to the provided value, and returns
* the previously set precision; callers are supposed to restore the
* previous precision on exit. The correct (52-bit) precision is
* configured with the value "2". On unsupported compilers, or on
* targets other than 32-bit x86, or when the native 'double' type is
* not used, the set_fpu_cw() function does nothing at all.
*/
static inline unsigned
set_fpu_cw(unsigned x) {
return x;
}
/* ==================================================================== */
/*
* SHAKE256 implementation (shake.c).
@ -53,11 +117,11 @@
#include "fips202.h"
#define shake256_context shake256incctx
#define shake256_init(sc) shake256_inc_init(sc)
#define shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
#define shake256_flip(sc) shake256_inc_finalize(sc)
#define shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
#define inner_shake256_context shake256incctx
#define inner_shake256_init(sc) shake256_inc_init(sc)
#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
/* ==================================================================== */
@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON1024_CLEAN_max_sig_bits[];
/*
* From a SHAKE256 context (must be already flipped), produce a new
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
* point. This is the non-constant-time version, which may leak enough
* information to serve as a stop condition on a brute force attack on
* the hashed message (provided that the nonce value is known).
*/
void PQCLEAN_FALCON1024_CLEAN_hash_to_point(shake256_context *sc,
void PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
uint16_t *x, unsigned logn);
/*
* From a SHAKE256 context (must be already flipped), produce a new
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
* This function is constant-time but is typically more expensive than
* PQCLEAN_FALCON1024_CLEAN_hash_to_point_vartime().
*
* tmp[] must have 16-bit alignment.
*/
void PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp);
/*
@ -184,6 +261,8 @@ void PQCLEAN_FALCON1024_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
* logn is the degree log
* tmp[] temporary, must have at least 2*2^logn bytes
* Returned value is 1 on success, 0 on error.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
const uint16_t *h, unsigned logn, uint8_t *tmp);
@ -195,6 +274,7 @@ int PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
* reported if f is not invertible mod phi mod q).
*
* The tmp[] array must have room for at least 2*2^logn elements.
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@ -208,11 +288,53 @@ int PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
* The tmp[] array must have room for at least 4*2^logn bytes.
*
* Returned value is 1 on success, 0 on error (f not invertible).
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
const int8_t *f, const int8_t *g, const int8_t *F,
unsigned logn, uint8_t *tmp);
/*
* Test whether a given polynomial is invertible modulo phi and q.
* Polynomial coefficients are small integers.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_is_invertible(
const int16_t *s2, unsigned logn, uint8_t *tmp);
/*
* Count the number of elements of value zero in the NTT representation
* of the given polynomial: this is the number of primitive 2n-th roots
* of unity (modulo q = 12289) that are roots of the provided polynomial
* (taken modulo q).
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
/*
* Internal signature verification with public key recovery:
* h[] receives the public key (NOT in NTT/Montgomery format)
* c0[] contains the hashed nonce+message
* s1[] is the first signature half
* s2[] is the second signature half
* logn is the degree log
* tmp[] temporary, must have at least 2*2^logn bytes
* Returned value is 1 on success, 0 on error. Success is returned if
* the signature is a short enough vector; in that case, the public
* key has been written to h[]. However, the caller must still
* verify that h[] is the correct value (e.g. with regards to a known
* hash of the public key).
*
* h[] may not overlap with any of the other arrays.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
const uint16_t *c0, const int16_t *s1, const int16_t *s2,
unsigned logn, uint8_t *tmp);
/* ==================================================================== */
/*
* Implementation of floating-point real numbers (fpr.h, fpr.c).
@ -358,7 +480,7 @@ typedef struct {
* Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
* context (in "flipped" state) to obtain its initial state.
*/
void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src);
void PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src);
/*
* Refill the PRNG buffer. This is normally invoked automatically, and
@ -586,6 +708,9 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,
/*
* Required sizes of the temporary buffer (in bytes).
*
* This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
* or 2) where it is slightly greater.
*/
#define FALCON_KEYGEN_TEMP_1 136
#define FALCON_KEYGEN_TEMP_2 272
@ -608,8 +733,11 @@ void PQCLEAN_FALCON1024_CLEAN_poly_merge_fft(fpr *f,
* public key is written in h. Either or both of G and h may be NULL,
* in which case the corresponding element is not returned (they can
* be recomputed from f, g and F).
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
void PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
unsigned logn, uint8_t *tmp);
@ -624,6 +752,9 @@ void PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
* a total of (8*logn+40)*2^logn bytes.
*
* The tmp[] array must have room for at least 48*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@ -636,9 +767,15 @@ void PQCLEAN_FALCON1024_CLEAN_expand_privkey(fpr *expanded_key,
*
* The sig[] and hm[] buffers may overlap.
*
* On successful output, the start of the tmp[] buffer contains the s1
* vector (as int16_t elements).
*
* The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
const fpr *expanded_key,
const uint16_t *hm, unsigned logn, uint8_t *tmp);
@ -651,13 +788,47 @@ void PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
*
* The sig[] and hm[] buffers may overlap.
*
* On successful output, the start of the tmp[] buffer contains the s1
* vector (as int16_t elements).
*
* The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
void PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
const int8_t *f, const int8_t *g,
const int8_t *F, const int8_t *G,
const uint16_t *hm, unsigned logn, uint8_t *tmp);
/*
* Internal sampler engine. Exported for tests.
*
* sampler_context wraps around a source of random numbers (PRNG) and
* the sigma_min value (nominally dependent on the degree).
*
* sampler() takes as parameters:
* ctx pointer to the sampler_context structure
* mu center for the distribution
* isigma inverse of the distribution standard deviation
* It returns an integer sampled along the Gaussian distribution centered
* on mu and of standard deviation sigma = 1/isigma.
*
* gaussian0_sampler() takes as parameter a pointer to a PRNG, and
* returns an integer sampled along a half-Gaussian with standard
* deviation sigma0 = 1.8205 (center is 0, returned value is
* nonnegative).
*/
typedef struct {
prng p;
fpr sigma_min;
} sampler_context;
int PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
int PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p);
/* ==================================================================== */
#endif

View File

@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
/* ==================================================================== */
#define RNG_CONTEXT inner_shake256_context
/*
* Get a random 8-byte integer from a SHAKE-based RNG. This function
* ensures consistent interpretation of the SHAKE output so that
@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
* a known seed is used.
*/
static inline uint64_t
get_rng_u64(shake256_context *rng) {
get_rng_u64(inner_shake256_context *rng) {
/*
* We enforce little-endian representation.
*/
uint8_t tmp[8];
shake256_extract(rng, tmp, sizeof tmp);
inner_shake256_extract(rng, tmp, sizeof tmp);
return (uint64_t)tmp[0]
| ((uint64_t)tmp[1] << 8)
| ((uint64_t)tmp[2] << 16)
@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
| ((uint64_t)tmp[7] << 56);
}
/*
* Table below incarnates a discrete Gaussian distribution:
* D(x) = exp(-(x^2)/(2*sigma^2))
@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
* together for lower dimensions.
*/
static int
mkgauss(shake256_context *rng, unsigned logn) {
mkgauss(RNG_CONTEXT *rng, unsigned logn) {
unsigned u, g;
int val;
@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
fpr xv;
xv = fpr_mul(rt2[u], pdc);
/*
* Sometimes the values can be out-of-bounds if
* the algorithm fails; we must not call
@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
* also makes sure that the resultant of the polynomial with phi is odd.
*/
static void
poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
size_t n, u;
unsigned mod2;
@ -4046,7 +4051,7 @@ restart:
/* see falcon.h */
void
PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
PQCLEAN_FALCON1024_CLEAN_keygen(inner_shake256_context *rng,
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
unsigned logn, uint8_t *tmp) {
/*
@ -4070,8 +4075,10 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
*/
size_t n, u;
uint16_t *h2, *tmp2;
RNG_CONTEXT *rc;
n = MKN(logn);
rc = rng;
/*
* We need to generate f and g randomly, until we find values
@ -4104,8 +4111,8 @@ PQCLEAN_FALCON1024_CLEAN_keygen(shake256_context *rng,
* (i.e. the resultant of the polynomial with phi
* will be odd).
*/
poly_small_mkgauss(rng, f, logn);
poly_small_mkgauss(rng, g, logn);
poly_small_mkgauss(rc, f, logn);
poly_small_mkgauss(rc, g, logn);
/*
* Verify that all coefficients are within the bounds

View File

@ -51,16 +51,16 @@ PQCLEAN_FALCON1024_CLEAN_crypto_sign_keypair(
int8_t f[1024], g[1024], F[1024];
uint16_t h[1024];
unsigned char seed[48];
shake256_context rng;
inner_shake256_context rng;
size_t u, v;
/*
* Generate key pair.
*/
randombytes(seed, sizeof seed);
shake256_init(&rng);
shake256_inject(&rng, seed, sizeof seed);
shake256_flip(&rng);
inner_shake256_init(&rng);
inner_shake256_inject(&rng, seed, sizeof seed);
inner_shake256_flip(&rng);
PQCLEAN_FALCON1024_CLEAN_keygen(&rng, f, g, F, NULL, h, 10, tmp.b);
/*
@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
uint16_t hm[1024];
} r;
unsigned char seed[48];
shake256_context sc;
inner_shake256_context sc;
size_t u, v;
/*
@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
/*
* Hash message nonce + message into a vector.
*/
shake256_init(&sc);
shake256_inject(&sc, nonce, NONCELEN);
shake256_inject(&sc, m, mlen);
shake256_flip(&sc);
PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, r.hm, 10, tmp.b);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, nonce, NONCELEN);
inner_shake256_inject(&sc, m, mlen);
inner_shake256_flip(&sc);
PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, r.hm, 10, tmp.b);
/*
* Initialize a RNG.
*/
randombytes(seed, sizeof seed);
shake256_init(&sc);
shake256_inject(&sc, seed, sizeof seed);
shake256_flip(&sc);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, seed, sizeof seed);
inner_shake256_flip(&sc);
/*
* Compute and return the signature. This loops until a signature
@ -225,7 +225,7 @@ do_verify(
} tmp;
uint16_t h[1024], hm[1024];
int16_t sig[1024];
shake256_context sc;
inner_shake256_context sc;
/*
* Decode public key.
@ -253,11 +253,11 @@ do_verify(
/*
* Hash nonce + message into a vector.
*/
shake256_init(&sc);
shake256_inject(&sc, nonce, NONCELEN);
shake256_inject(&sc, m, mlen);
shake256_flip(&sc);
PQCLEAN_FALCON1024_CLEAN_hash_to_point(&sc, hm, 10, tmp.b);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, nonce, NONCELEN);
inner_shake256_inject(&sc, m, mlen);
inner_shake256_flip(&sc);
PQCLEAN_FALCON1024_CLEAN_hash_to_point_ct(&sc, hm, 10, tmp.b);
/*
* Verify signature.

View File

@ -36,7 +36,7 @@
/* see inner.h */
void
PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
/*
* To ensure reproducibility for a given seed, we
* must enforce little-endian interpretation of
@ -46,7 +46,7 @@ PQCLEAN_FALCON1024_CLEAN_prng_init(prng *p, shake256_context *src) {
uint64_t th, tl;
int i;
shake256_extract(src, tmp, 56);
inner_shake256_extract(src, tmp, 56);
for (i = 0; i < 14; i ++) {
uint32_t w;

View File

@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
size_t n, hn;
const fpr *tree0, *tree1;
n = (size_t)1 << logn;
if (n == 1) {
/*
* When logn == 2, we inline the last two recursion levels.
*/
if (logn == 2) {
fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
fpr a_re, a_im, b_re, b_im, c_re, c_im;
tree0 = tree + 4;
tree1 = tree + 8;
/*
* We split t1 into w*, then do the recursive invocation,
* with output in w*. We finally merge back into z1.
*/
a_re = t1[0];
a_im = t1[2];
b_re = t1[1];
b_im = t1[3];
c_re = fpr_add(a_re, b_re);
c_im = fpr_add(a_im, b_im);
w0 = fpr_half(c_re);
w1 = fpr_half(c_im);
c_re = fpr_sub(a_re, b_re);
c_im = fpr_sub(a_im, b_im);
w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
x0 = w2;
x1 = w3;
sigma = tree1[3];
w2 = fpr_of(samp(samp_ctx, x0, sigma));
w3 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, w2);
a_im = fpr_sub(x1, w3);
b_re = tree1[0];
b_im = tree1[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, w0);
x1 = fpr_add(c_im, w1);
sigma = tree1[2];
w0 = fpr_of(samp(samp_ctx, x0, sigma));
w1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = w0;
a_im = w1;
b_re = w2;
b_im = w3;
c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
z1[0] = w0 = fpr_add(a_re, c_re);
z1[2] = w2 = fpr_add(a_im, c_im);
z1[1] = w1 = fpr_sub(a_re, c_re);
z1[3] = w3 = fpr_sub(a_im, c_im);
/*
* Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
*/
w0 = fpr_sub(t1[0], w0);
w1 = fpr_sub(t1[1], w1);
w2 = fpr_sub(t1[2], w2);
w3 = fpr_sub(t1[3], w3);
a_re = w0;
a_im = w2;
b_re = tree[0];
b_im = tree[2];
w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
a_re = w1;
a_im = w3;
b_re = tree[1];
b_im = tree[3];
w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
w0 = fpr_add(w0, t0[0]);
w1 = fpr_add(w1, t0[1]);
w2 = fpr_add(w2, t0[2]);
w3 = fpr_add(w3, t0[3]);
/*
* Second recursive invocation.
*/
a_re = w0;
a_im = w2;
b_re = w1;
b_im = w3;
c_re = fpr_add(a_re, b_re);
c_im = fpr_add(a_im, b_im);
w0 = fpr_half(c_re);
w1 = fpr_half(c_im);
c_re = fpr_sub(a_re, b_re);
c_im = fpr_sub(a_im, b_im);
w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
x0 = w2;
x1 = w3;
sigma = tree0[3];
w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, y0);
a_im = fpr_sub(x1, y1);
b_re = tree0[0];
b_im = tree0[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, w0);
x1 = fpr_add(c_im, w1);
sigma = tree0[2];
w0 = fpr_of(samp(samp_ctx, x0, sigma));
w1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = w0;
a_im = w1;
b_re = w2;
b_im = w3;
c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
z0[0] = fpr_add(a_re, c_re);
z0[2] = fpr_add(a_im, c_im);
z0[1] = fpr_sub(a_re, c_re);
z0[3] = fpr_sub(a_im, c_im);
return;
}
/*
* Case logn == 1 is reachable only when using Falcon-2 (the
* smallest size for which Falcon is mathematically defined, but
* of course way too insecure to be of any use).
*/
if (logn == 1) {
fpr x0, x1, y0, y1, sigma;
fpr a_re, a_im, b_re, b_im, c_re, c_im;
x0 = t1[0];
x1 = t1[1];
sigma = tree[3];
z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, y0);
a_im = fpr_sub(x1, y1);
b_re = tree[0];
b_im = tree[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, t0[0]);
x1 = fpr_add(c_im, t0[1]);
sigma = tree[2];
z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
return;
}
/*
* Normal end of recursion is for logn == 0. Since the last
* steps of the recursions were inlined in the blocks above
* (when logn == 1 or 2), this case is not reachable, and is
* retained here only for documentation purposes.
if (logn == 0) {
fpr x0, x1, sigma;
x0 = t0[0];
@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
return;
}
*/
/*
* General recursive case (logn >= 3).
*/
n = (size_t)1 << logn;
hn = n >> 1;
tree0 = tree + n;
tree1 = tree + n + ffLDL_treesize(logn - 1);
@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
const fpr *b00, *b01, *b10, *b11, *tree;
fpr ni;
uint32_t sqn, ng;
int16_t *s2tmp;
int16_t *s1tmp, *s2tmp;
n = MKN(logn);
t0 = tmp;
@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
/*
* Compute the signature.
*/
s1tmp = (int16_t *)tx;
sqn = 0;
ng = 0;
for (u = 0; u < n; u ++) {
@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
sqn += (uint32_t)(z * z);
ng |= sqn;
s1tmp[u] = (int16_t)z;
}
sqn |= -(ng >> 31);
@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
}
if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
memcpy(s2, s2tmp, n * sizeof * s2);
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
return 1;
}
return 0;
@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
fpr ni;
uint32_t sqn, ng;
int16_t *s2tmp;
int16_t *s1tmp, *s2tmp;
n = MKN(logn);
@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
PQCLEAN_FALCON1024_CLEAN_iFFT(t0, logn);
PQCLEAN_FALCON1024_CLEAN_iFFT(t1, logn);
s1tmp = (int16_t *)tx;
sqn = 0;
ng = 0;
for (u = 0; u < n; u ++) {
@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
sqn += (uint32_t)(z * z);
ng |= sqn;
s1tmp[u] = (int16_t)z;
}
sqn |= -(ng >> 31);
@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
}
if (PQCLEAN_FALCON1024_CLEAN_is_short_half(sqn, s2tmp, logn)) {
memcpy(s2, s2tmp, n * sizeof * s2);
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
return 1;
}
return 0;
@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
* Sample an integer value along a half-gaussian distribution centered
* on zero and standard deviation 1.8205, with a precision of 72 bits.
*/
static int
gaussian0_sampler(prng *p) {
int
PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(prng *p) {
static const uint32_t dist[] = {
6031371U, 13708371U, 13035518U,
5186761U, 1487980U, 12270720U,
3298653U, 4688887U, 5511555U,
1551448U, 9247616U, 9467675U,
539632U, 14076116U, 5909365U,
138809U, 10836485U, 13263376U,
26405U, 15335617U, 16601723U,
3714U, 14514117U, 13240074U,
386U, 8324059U, 3276722U,
29U, 12376792U, 7821247U,
1U, 11611789U, 3398254U,
0U, 1194629U, 4532444U,
0U, 37177U, 2973575U,
0U, 855U, 10369757U,
0U, 14U, 9441597U,
0U, 0U, 3075302U,
0U, 0U, 28626U,
0U, 0U, 197U,
0U, 0U, 1U
10745844u, 3068844u, 3741698u,
5559083u, 1580863u, 8248194u,
2260429u, 13669192u, 2736639u,
708981u, 4421575u, 10046180u,
169348u, 7122675u, 4136815u,
30538u, 13063405u, 7650655u,
4132u, 14505003u, 7826148u,
417u, 16768101u, 11363290u,
31u, 8444042u, 8086568u,
1u, 12844466u, 265321u,
0u, 1232676u, 13644283u,
0u, 38047u, 9111839u,
0u, 870u, 6138264u,
0u, 14u, 12545723u,
0u, 0u, 3104126u,
0u, 0u, 28824u,
0u, 0u, 198u,
0u, 0u, 1u
};
uint32_t v0, v1, v2, hi;
@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
* Sample a bit with probability exp(-x) for some x >= 0.
*/
static int
BerExp(prng *p, fpr x) {
BerExp(prng *p, fpr x, fpr ccs) {
int s, i;
fpr r;
uint32_t sw, w;
@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
* case). The bias is negligible since fpr_expm_p63() only computes
* with 51 bits of precision or so.
*/
z = ((fpr_expm_p63(r) << 1) - 1) >> s;
z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
/*
* Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
return (int)(w >> 31);
}
typedef struct {
prng p;
fpr sigma_min;
} sampler_context;
/*
* The sampler produces a random integer that follows a discrete Gaussian
* distribution, centered on mu, and with standard deviation sigma. The
@ -909,8 +1078,8 @@ typedef struct {
* The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
* 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
*/
static int
sampler(void *ctx, fpr mu, fpr isigma) {
int
PQCLEAN_FALCON1024_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
sampler_context *spc;
int s;
fpr r, dss, ccs;
@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
* - b = 0: z <= 0 and sampled against a Gaussian
* centered on 0.
*/
z0 = gaussian0_sampler(&spc->p);
z0 = PQCLEAN_FALCON1024_CLEAN_gaussian0_sampler(&spc->p);
b = prng_get_u8(&spc->p) & 1;
z = b + ((b << 1) - 1) * z0;
@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
*/
x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
x = fpr_mul(x, ccs);
if (BerExp(&spc->p, x)) {
if (BerExp(&spc->p, x, ccs)) {
/*
* Rejection sampling was centered on r, but the
* actual center is mu = s + r.
@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
/* see inner.h */
void
PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
const fpr *expanded_key,
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
fpr *ftmp;
@ -1025,7 +1193,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
? fpr_sigma_min_10
: fpr_sigma_min_9;
PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
samp = sampler;
samp = PQCLEAN_FALCON1024_CLEAN_sampler;
samp_ctx = &spc;
/*
@ -1040,7 +1208,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
/* see inner.h */
void
PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
const int8_t *f, const int8_t *g,
const int8_t *F, const int8_t *G,
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@ -1070,7 +1238,7 @@ PQCLEAN_FALCON1024_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
? fpr_sigma_min_10
: fpr_sigma_min_9;
PQCLEAN_FALCON1024_CLEAN_prng_init(&spc.p, rng);
samp = sampler;
samp = PQCLEAN_FALCON1024_CLEAN_sampler;
samp_ctx = &spc;
/*

View File

@ -649,7 +649,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
}
/*
* Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
* Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
*/
mq_NTT(tt, logn);
mq_poly_montymul_ntt(tt, h, logn);
@ -657,7 +657,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
mq_poly_sub(tt, c0, logn);
/*
* Normalize s1 elements into the [-q/2..q/2] range.
* Normalize -s1 elements into the [-q/2..q/2] range.
*/
for (u = 0; u < n; u ++) {
int32_t w;
@ -668,7 +668,7 @@ PQCLEAN_FALCON1024_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
}
/*
* Signature is valid if and only if the aggregate (s1,s2) vector
* Signature is valid if and only if the aggregate (-s1,s2) vector
* is short enough.
*/
return PQCLEAN_FALCON1024_CLEAN_is_short((int16_t *)tt, s2, logn);
@ -699,7 +699,7 @@ PQCLEAN_FALCON1024_CLEAN_compute_public(uint16_t *h,
return 1;
}
/* see internal.h */
/* see inner.h */
int
PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
const int8_t *f, const int8_t *g, const int8_t *F,
@ -743,3 +743,110 @@ PQCLEAN_FALCON1024_CLEAN_complete_private(int8_t *G,
}
return 1;
}
/* see inner.h */
/*
 * Tell whether s2 is invertible modulo phi and q: this is the case if
 * and only if none of its coefficients is zero in NTT representation.
 * Returned value is 1 (invertible) or 0 (not invertible).
 *
 * tmp[] is used as scratch for 2^logn 16-bit values; per inner.h it
 * must have 16-bit alignment.
 */
int
PQCLEAN_FALCON1024_CLEAN_is_invertible(
        const int16_t *s2, unsigned logn, uint8_t *tmp) {
    size_t u, n;
    uint16_t *tt;
    uint32_t r;

    n = (size_t)1 << logn;
    tt = (uint16_t *)tmp;
    for (u = 0; u < n; u ++) {
        uint32_t w;

        /*
         * Reduce s2[u] into [0, q-1]: the cast sign-extends, so a
         * negative coefficient sets the top bit of w and Q is added.
         */
        w = (uint32_t)s2[u];
        w += Q & -(w >> 31);
        tt[u] = (uint16_t)w;
    }
    mq_NTT(tt, logn);

    /*
     * (tt[u] - 1) wraps to an all-ones word exactly when tt[u] == 0;
     * thus the top bit of r ends up set if and only if at least one
     * NTT coefficient is zero. This keeps the check constant-time.
     */
    r = 0;
    for (u = 0; u < n; u ++) {
        r |= (uint32_t)(tt[u] - 1);
    }
    return (int)(1u - (r >> 31));
}
/* see inner.h */
/*
 * Signature verification with public key recovery: rebuild the public
 * key as h = (c0 - s1) / s2 mod phi mod q, and check that (s1, s2) is
 * a short enough vector. Returned value is 1 on success (h[] then
 * holds the recovered key, in normal representation), 0 otherwise.
 * Per inner.h, the caller must still match h[] against the expected
 * public key (e.g. through a hash), and tmp[] needs 16-bit alignment.
 */
int
PQCLEAN_FALCON1024_CLEAN_verify_recover(uint16_t *h,
        const uint16_t *c0, const int16_t *s1, const int16_t *s2,
        unsigned logn, uint8_t *tmp) {
    size_t u, n;
    uint16_t *tt;
    uint32_t r;

    n = (size_t)1 << logn;

    /*
     * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
     * and c0 - s1 into h[].
     */
    tt = (uint16_t *)tmp;
    for (u = 0; u < n; u ++) {
        uint32_t w;

        /* Negative coefficients get Q added (the cast sign-extends). */
        w = (uint32_t)s2[u];
        w += Q & -(w >> 31);
        tt[u] = (uint16_t)w;

        w = (uint32_t)s1[u];
        w += Q & -(w >> 31);
        w = mq_sub(c0[u], w);
        h[u] = (uint16_t)w;
    }

    /*
     * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
     * is zero (in NTT representation) then the operation fails. We
     * keep that information in a flag so that we do not deviate
     * from strict constant-time processing; if all coefficients of
     * s2 are non-zero, then the high bit of r will be zero.
     */
    mq_NTT(tt, logn);
    mq_NTT(h, logn);
    r = 0;
    for (u = 0; u < n; u ++) {
        /* (tt[u] - 1) sets the top bit of r exactly when tt[u] == 0. */
        r |= (uint32_t)(tt[u] - 1);
        h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
    }
    mq_iNTT(h, logn);

    /*
     * Signature is acceptable if and only if it is short enough,
     * and s2 was invertible mod phi mod q. The caller must still
     * check that the rebuilt public key matches the expected
     * value (e.g. through a hash).
     */
    r = ~r & (uint32_t) - PQCLEAN_FALCON1024_CLEAN_is_short(s1, s2, logn);
    return (int)(r >> 31);
}
/* see inner.h */
/*
 * Count the coefficients of sig that are zero in NTT representation,
 * i.e. the number of primitive 2n-th roots of unity modulo q that are
 * roots of the polynomial. tmp[] is scratch for 2^logn 16-bit values
 * and must have 16-bit alignment.
 */
int
PQCLEAN_FALCON1024_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
    uint16_t *t;
    size_t i, n;
    uint32_t zcount;

    n = (size_t)1 << logn;
    t = (uint16_t *)tmp;

    /*
     * Copy the signature into the scratch area, reducing each
     * coefficient into the [0, q-1] range (negative values get Q
     * added; the cast sign-extends, so the top bit flags negatives).
     */
    for (i = 0; i < n; i ++) {
        uint32_t v;

        v = (uint32_t)sig[i];
        v += Q & -(v >> 31);
        t[i] = (uint16_t)v;
    }
    mq_NTT(t, logn);

    /*
     * (t[i] - 1) underflows to an all-ones word exactly when t[i] is
     * zero, so its top bit contributes 1 to the count in that case
     * and 0 otherwise.
     */
    zcount = 0;
    for (i = 0; i < n; i ++) {
        zcount += ((uint32_t)t[i] - 1u) >> 31;
    }
    return (int)zcount;
}

View File

@ -4,8 +4,8 @@ claimed-nist-level: 1
length-public-key: 897
length-secret-key: 1281
length-signature: 690
nistkat-sha256: abc62e7be3d7c1db757ba3cbb771cfdc89c6b36fb5efc885593db89ec2ea8bc4
testvectors-sha256: 1a1b170fc9e4623e7ff519c15ec7a2dda55e94a175756b7c72429451bd226b09
nistkat-sha256: e9c3985f1ce732e29ca81aeca091f20d4dbb5beb456ee1a7ab41d04add4dab10
testvectors-sha256: 036b5e803ab825146502513b7460b24cc9493f8e366323cd5e30e2dc6d4ca6a7
principal-submitters:
- Thomas Prest
auxiliary-submitters:

View File

@ -33,10 +33,43 @@
/* see inner.h */
void
PQCLEAN_FALCON512_CLEAN_hash_to_point(
shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp) {
PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(
inner_shake256_context *sc,
uint16_t *x, unsigned logn) {
/*
* This is the straightforward per-the-spec implementation. It
* is not constant-time, thus it might reveal information on the
* plaintext (at least, enough to check the plaintext against a
* list of potential plaintexts) in a scenario where the
* attacker does not have access to the signature value or to
* the public key, but knows the nonce (without knowledge of the
* nonce, the hashed output cannot be matched against potential
* plaintexts).
*/
size_t n;
n = (size_t)1 << logn;
while (n > 0) {
uint8_t buf[2];
uint32_t w;
inner_shake256_extract(sc, (void *)buf, sizeof buf);
w = ((unsigned)buf[0] << 8) | (unsigned)buf[1];
if (w < 61445) {
while (w >= 12289) {
w -= 12289;
}
*x ++ = (uint16_t)w;
n --;
}
}
}
/* see inner.h */
void
PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(
inner_shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp) {
/*
* Each 16-bit sample is a value in 0..65535. The value is
* kept if it falls in 0..61444 (because 61445 = 5*12289)
@ -97,7 +130,7 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
uint8_t buf[2];
uint32_t w, wr;
shake256_extract(sc, buf, sizeof buf);
inner_shake256_extract(sc, buf, sizeof buf);
w = ((uint32_t)buf[0] << 8) | (uint32_t)buf[1];
wr = w - ((uint32_t)24578 & (((w - 24578) >> 31) - 1));
wr = wr - ((uint32_t)24578 & (((wr - 24578) >> 31) - 1));
@ -196,7 +229,6 @@ PQCLEAN_FALCON512_CLEAN_hash_to_point(
*d = (uint16_t)(dv ^ (mk & (sv ^ dv)));
}
}
}
/* see inner.h */

View File

@ -507,7 +507,7 @@ fpr_sqrt(fpr x) {
uint64_t
fpr_expm_p63(fpr x) {
fpr_expm_p63(fpr x, fpr ccs) {
/*
* Polynomial approximation of exp(-x) is taken from FACCT:
* https://eprint.iacr.org/2018/1234
@ -539,6 +539,8 @@ fpr_expm_p63(fpr x) {
uint64_t z, y;
unsigned u;
uint32_t z0, z1, y0, y1;
uint64_t a, b;
y = C[0];
z = (uint64_t)fpr_trunc(fpr_mul(x, fpr_ptwo63)) << 1;
@ -554,8 +556,7 @@ fpr_expm_p63(fpr x) {
* also have appropriate IEEE754 floating-point support,
* which is better.
*/
uint32_t z0, z1, y0, y1;
uint64_t a, b, c;
uint64_t c;
z0 = (uint32_t)z;
z1 = (uint32_t)(z >> 32);
@ -569,6 +570,24 @@ fpr_expm_p63(fpr x) {
c += (uint64_t)z1 * (uint64_t)y1;
y = C[u] - c;
}
/*
* The scaling factor must be applied at the end. Since y is now
* in fixed-point notation, we have to convert the factor to the
* same format, and do an extra integer multiplication.
*/
z = (uint64_t)fpr_trunc(fpr_mul(ccs, fpr_ptwo63)) << 1;
z0 = (uint32_t)z;
z1 = (uint32_t)(z >> 32);
y0 = (uint32_t)y;
y1 = (uint32_t)(y >> 32);
a = ((uint64_t)z0 * (uint64_t)y1)
+ (((uint64_t)z0 * (uint64_t)y0) >> 32);
b = ((uint64_t)z1 * (uint64_t)y0);
y = (a >> 32) + (b >> 32);
y += (((uint64_t)(uint32_t)a + (uint64_t)(uint32_t)b) >> 32);
y += (uint64_t)z1 * (uint64_t)y1;
return y;
}

View File

@ -232,6 +232,8 @@ static const fpr fpr_zero = 0;
static const fpr fpr_one = 4607182418800017408;
static const fpr fpr_two = 4611686018427387904;
static const fpr fpr_onehalf = 4602678819172646912;
static const fpr fpr_invsqrt2 = 4604544271217802189;
static const fpr fpr_invsqrt8 = 4600040671590431693;
static const fpr fpr_ptwo31 = 4746794007248502784;
static const fpr fpr_ptwo31m1 = 4746794007244308480;
static const fpr fpr_mtwo31m1 = 13970166044099084288U;
@ -444,7 +446,7 @@ fpr_lt(fpr x, fpr y) {
* bits or so.
*/
#define fpr_expm_p63 PQCLEAN_FALCON512_CLEAN_fpr_expm_p63
uint64_t fpr_expm_p63(fpr x);
uint64_t fpr_expm_p63(fpr x, fpr ccs);
#define fpr_gm_tab PQCLEAN_FALCON512_CLEAN_fpr_gm_tab
extern const fpr fpr_gm_tab[];

View File

@ -34,6 +34,45 @@
* @author Thomas Pornin <thomas.pornin@nccgroup.com>
*/
/*
* IMPORTANT API RULES
* -------------------
*
* This API has some non-trivial usage rules:
*
*
* - All public functions (i.e. the non-static ones) must be referenced
* with the PQCLEAN_FALCON512_CLEAN_ macro (e.g. PQCLEAN_FALCON512_CLEAN_verify_raw for the verify_raw()
* function). That macro adds a prefix to the name, which is
* configurable with the FALCON_PREFIX macro. This allows compiling
* the code into a specific "namespace" and potentially including
* several versions of this code into a single application (e.g. to
* have an AVX2 and a non-AVX2 variants and select the one to use at
* runtime based on availability of AVX2 opcodes).
*
* - Functions that need temporary buffers expect them as a final
* tmp[] array of type uint8_t*, with a size which is documented for
* each function. However, most have some alignment requirements,
* because they will use the array to store 16-bit, 32-bit or 64-bit
* values (e.g. uint64_t or double). The caller must ensure proper
* alignment. What happens on unaligned access depends on the
* underlying architecture, ranging from a slight time penalty
* to immediate termination of the process.
*
* - Some functions rely on specific rounding rules and precision for
* floating-point numbers. On some systems (in particular 32-bit x86
* with the 387 FPU), this requires setting a hardware control
* word. The caller MUST use set_fpu_cw() to ensure proper precision:
*
* oldcw = set_fpu_cw(2);
* PQCLEAN_FALCON512_CLEAN_sign_dyn(...);
* set_fpu_cw(oldcw);
*
* On systems where the native floating-point precision is already
* proper, or integer-based emulation is used, the set_fpu_cw()
* function does nothing, so it can be called systematically.
*/
#include <stdint.h>
#include <stdlib.h>
@ -42,6 +81,31 @@
/*
* Some computations with floating-point elements, in particular
* rounding to the nearest integer, rely on operations using _exactly_
* the precision of IEEE-754 binary64 type (i.e. 52 bits). On 32-bit
* x86, the 387 FPU may be used (depending on the target OS) and, in
* that case, may use more precision bits (i.e. 64 bits, for an 80-bit
* total type length); to prevent miscomputations, we define an explicit
* function that modifies the precision in the FPU control word.
*
* set_fpu_cw() sets the precision to the provided value, and returns
* the previously set precision; callers are supposed to restore the
* previous precision on exit. The correct (52-bit) precision is
* configured with the value "2". On unsupported compilers, or on
* targets other than 32-bit x86, or when the native 'double' type is
* not used, the set_fpu_cw() function does nothing at all.
*/
/*
 * No-op implementation: on this build target the native floating-point
 * precision is already correct (or integer emulation is used), so the
 * previous control word is simply reported as the requested value.
 */
static inline unsigned
set_fpu_cw(unsigned x) {
    unsigned previous;

    previous = x;
    return previous;
}
/* ==================================================================== */
/*
* SHAKE256 implementation (shake.c).
@ -53,11 +117,11 @@
#include "fips202.h"
#define shake256_context shake256incctx
#define shake256_init(sc) shake256_inc_init(sc)
#define shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
#define shake256_flip(sc) shake256_inc_finalize(sc)
#define shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
#define inner_shake256_context shake256incctx
#define inner_shake256_init(sc) shake256_inc_init(sc)
#define inner_shake256_inject(sc, in, len) shake256_inc_absorb(sc, in, len)
#define inner_shake256_flip(sc) shake256_inc_finalize(sc)
#define inner_shake256_extract(sc, out, len) shake256_inc_squeeze(out, len, sc)
/* ==================================================================== */
@ -140,9 +204,22 @@ extern const uint8_t PQCLEAN_FALCON512_CLEAN_max_sig_bits[];
/*
* From a SHAKE256 context (must be already flipped), produce a new
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
* point. This is the non-constant-time version, which may leak enough
* information to serve as a stop condition on a brute force attack on
* the hashed message (provided that the nonce value is known).
*/
void PQCLEAN_FALCON512_CLEAN_hash_to_point(shake256_context *sc,
void PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime(inner_shake256_context *sc,
uint16_t *x, unsigned logn);
/*
* From a SHAKE256 context (must be already flipped), produce a new
* point. The temporary buffer (tmp) must have room for 2*2^logn bytes.
* This function is constant-time but is typically more expensive than
* PQCLEAN_FALCON512_CLEAN_hash_to_point_vartime().
*
* tmp[] must have 16-bit alignment.
*/
void PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(inner_shake256_context *sc,
uint16_t *x, unsigned logn, uint8_t *tmp);
/*
@ -184,6 +261,8 @@ void PQCLEAN_FALCON512_CLEAN_to_ntt_monty(uint16_t *h, unsigned logn);
* logn is the degree log
* tmp[] temporary, must have at least 2*2^logn bytes
* Returned value is 1 on success, 0 on error.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
const uint16_t *h, unsigned logn, uint8_t *tmp);
@ -195,6 +274,7 @@ int PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
* reported if f is not invertible mod phi mod q).
*
* The tmp[] array must have room for at least 2*2^logn elements.
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
const int8_t *f, const int8_t *g, unsigned logn, uint8_t *tmp);
@ -208,11 +288,53 @@ int PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
* The tmp[] array must have room for at least 4*2^logn bytes.
*
* Returned value is 1 in success, 0 on error (f not invertible).
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
const int8_t *f, const int8_t *g, const int8_t *F,
unsigned logn, uint8_t *tmp);
/*
* Test whether a given polynomial is invertible modulo phi and q.
* Polynomial coefficients are small integers.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_is_invertible(
const int16_t *s2, unsigned logn, uint8_t *tmp);
/*
* Count the number of elements of value zero in the NTT representation
* of the given polynomial: this is the number of primitive 2n-th roots
* of unity (modulo q = 12289) that are roots of the provided polynomial
* (taken modulo q).
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp);
/*
* Internal signature verification with public key recovery:
* h[] receives the public key (NOT in NTT/Montgomery format)
* c0[] contains the hashed nonce+message
* s1[] is the first signature half
* s2[] is the second signature half
* logn is the degree log
* tmp[] temporary, must have at least 2*2^logn bytes
* Returned value is 1 on success, 0 on error. Success is returned if
* the signature is a short enough vector; in that case, the public
* key has been written to h[]. However, the caller must still
* verify that h[] is the correct value (e.g. with regards to a known
* hash of the public key).
*
* h[] may not overlap with any of the other arrays.
*
* tmp[] must have 16-bit alignment.
*/
int PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
const uint16_t *c0, const int16_t *s1, const int16_t *s2,
unsigned logn, uint8_t *tmp);
/* ==================================================================== */
/*
* Implementation of floating-point real numbers (fpr.h, fpr.c).
@ -358,7 +480,7 @@ typedef struct {
* Instantiate a PRNG. That PRNG will feed over the provided SHAKE256
* context (in "flipped" state) to obtain its initial state.
*/
void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src);
void PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src);
/*
* Refill the PRNG buffer. This is normally invoked automatically, and
@ -586,6 +708,9 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,
/*
* Required sizes of the temporary buffer (in bytes).
*
* This size is 28*2^logn bytes, except for degrees 2 and 4 (logn = 1
* or 2) where it is slightly greater.
*/
#define FALCON_KEYGEN_TEMP_1 136
#define FALCON_KEYGEN_TEMP_2 272
@ -608,8 +733,11 @@ void PQCLEAN_FALCON512_CLEAN_poly_merge_fft(fpr *f,
* public key is written in h. Either or both of G and h may be NULL,
* in which case the corresponding element is not returned (they can
* be recomputed from f, g and F).
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
void PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
unsigned logn, uint8_t *tmp);
@ -624,6 +752,9 @@ void PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
* a total of (8*logn+40)*2^logn bytes.
*
* The tmp[] array must have room for at least 48*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
const int8_t *f, const int8_t *g, const int8_t *F, const int8_t *G,
@ -636,9 +767,15 @@ void PQCLEAN_FALCON512_CLEAN_expand_privkey(fpr *expanded_key,
*
* The sig[] and hm[] buffers may overlap.
*
* On successful output, the start of the tmp[] buffer contains the s1
* vector (as int16_t elements).
*
* The minimal size (in bytes) of tmp[] is 48*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
const fpr *expanded_key,
const uint16_t *hm, unsigned logn, uint8_t *tmp);
@ -651,13 +788,47 @@ void PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
*
* The sig[] and hm[] buffers may overlap.
*
* On successful output, the start of the tmp[] buffer contains the s1
* vector (as int16_t elements).
*
* The minimal size (in bytes) of tmp[] is 72*2^logn bytes.
*
* tmp[] must have 64-bit alignment.
* This function uses floating-point rounding (see set_fpu_cw()).
*/
void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
void PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
const int8_t *f, const int8_t *g,
const int8_t *F, const int8_t *G,
const uint16_t *hm, unsigned logn, uint8_t *tmp);
/*
* Internal sampler engine. Exported for tests.
*
* sampler_context wraps around a source of random numbers (PRNG) and
* the sigma_min value (nominally dependent on the degree).
*
* sampler() takes as parameters:
* ctx pointer to the sampler_context structure
* mu center for the distribution
* isigma inverse of the distribution standard deviation
* It returns an integer sampled along the Gaussian distribution centered
* on mu and of standard deviation sigma = 1/isigma.
*
* gaussian0_sampler() takes as parameter a pointer to a PRNG, and
* returns an integer sampled along a half-Gaussian with standard
* deviation sigma0 = 1.8205 (center is 0, returned value is
* nonnegative).
*/
typedef struct {
prng p;
fpr sigma_min;
} sampler_context;
int PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma);
int PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p);
/* ==================================================================== */
#endif

View File

@ -2171,6 +2171,9 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
/* ==================================================================== */
#define RNG_CONTEXT inner_shake256_context
/*
* Get a random 8-byte integer from a SHAKE-based RNG. This function
* ensures consistent interpretation of the SHAKE output so that
@ -2178,14 +2181,14 @@ poly_sub_scaled_ntt(uint32_t *F, size_t Flen, size_t Fstride,
* a known seed is used.
*/
static inline uint64_t
get_rng_u64(shake256_context *rng) {
get_rng_u64(inner_shake256_context *rng) {
/*
* We enforce little-endian representation.
*/
uint8_t tmp[8];
shake256_extract(rng, tmp, sizeof tmp);
inner_shake256_extract(rng, tmp, sizeof tmp);
return (uint64_t)tmp[0]
| ((uint64_t)tmp[1] << 8)
| ((uint64_t)tmp[2] << 16)
@ -2196,6 +2199,7 @@ get_rng_u64(shake256_context *rng) {
| ((uint64_t)tmp[7] << 56);
}
/*
* Table below incarnates a discrete Gaussian distribution:
* D(x) = exp(-(x^2)/(2*sigma^2))
@ -2227,7 +2231,7 @@ static const uint64_t gauss_1024_12289[] = {
* together for lower dimensions.
*/
static int
mkgauss(shake256_context *rng, unsigned logn) {
mkgauss(RNG_CONTEXT *rng, unsigned logn) {
unsigned u, g;
int val;
@ -3156,6 +3160,7 @@ solve_NTRU_intermediate(unsigned logn_top,
fpr xv;
xv = fpr_mul(rt2[u], pdc);
/*
* Sometimes the values can be out-of-bounds if
* the algorithm fails; we must not call
@ -4006,7 +4011,7 @@ solve_NTRU(unsigned logn, int8_t *F, int8_t *G,
* also makes sure that the resultant of the polynomial with phi is odd.
*/
static void
poly_small_mkgauss(shake256_context *rng, int8_t *f, unsigned logn) {
poly_small_mkgauss(RNG_CONTEXT *rng, int8_t *f, unsigned logn) {
size_t n, u;
unsigned mod2;
@ -4046,7 +4051,7 @@ restart:
/* see falcon.h */
void
PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
PQCLEAN_FALCON512_CLEAN_keygen(inner_shake256_context *rng,
int8_t *f, int8_t *g, int8_t *F, int8_t *G, uint16_t *h,
unsigned logn, uint8_t *tmp) {
/*
@ -4070,8 +4075,10 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
*/
size_t n, u;
uint16_t *h2, *tmp2;
RNG_CONTEXT *rc;
n = MKN(logn);
rc = rng;
/*
* We need to generate f and g randomly, until we find values
@ -4104,8 +4111,8 @@ PQCLEAN_FALCON512_CLEAN_keygen(shake256_context *rng,
* (i.e. the resultant of the polynomial with phi
* will be odd).
*/
poly_small_mkgauss(rng, f, logn);
poly_small_mkgauss(rng, g, logn);
poly_small_mkgauss(rc, f, logn);
poly_small_mkgauss(rc, g, logn);
/*
* Verify that all coefficients are within the bounds

View File

@ -51,16 +51,16 @@ PQCLEAN_FALCON512_CLEAN_crypto_sign_keypair(
int8_t f[512], g[512], F[512];
uint16_t h[512];
unsigned char seed[48];
shake256_context rng;
inner_shake256_context rng;
size_t u, v;
/*
* Generate key pair.
*/
randombytes(seed, sizeof seed);
shake256_init(&rng);
shake256_inject(&rng, seed, sizeof seed);
shake256_flip(&rng);
inner_shake256_init(&rng);
inner_shake256_inject(&rng, seed, sizeof seed);
inner_shake256_flip(&rng);
PQCLEAN_FALCON512_CLEAN_keygen(&rng, f, g, F, NULL, h, 9, tmp.b);
/*
@ -135,7 +135,7 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
uint16_t hm[512];
} r;
unsigned char seed[48];
shake256_context sc;
inner_shake256_context sc;
size_t u, v;
/*
@ -181,19 +181,19 @@ do_sign(uint8_t *nonce, uint8_t *sigbuf, size_t *sigbuflen,
/*
* Hash message nonce + message into a vector.
*/
shake256_init(&sc);
shake256_inject(&sc, nonce, NONCELEN);
shake256_inject(&sc, m, mlen);
shake256_flip(&sc);
PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, r.hm, 9, tmp.b);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, nonce, NONCELEN);
inner_shake256_inject(&sc, m, mlen);
inner_shake256_flip(&sc);
PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, r.hm, 9, tmp.b);
/*
* Initialize a RNG.
*/
randombytes(seed, sizeof seed);
shake256_init(&sc);
shake256_inject(&sc, seed, sizeof seed);
shake256_flip(&sc);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, seed, sizeof seed);
inner_shake256_flip(&sc);
/*
* Compute and return the signature. This loops until a signature
@ -225,7 +225,7 @@ do_verify(
} tmp;
uint16_t h[512], hm[512];
int16_t sig[512];
shake256_context sc;
inner_shake256_context sc;
/*
* Decode public key.
@ -253,11 +253,11 @@ do_verify(
/*
* Hash nonce + message into a vector.
*/
shake256_init(&sc);
shake256_inject(&sc, nonce, NONCELEN);
shake256_inject(&sc, m, mlen);
shake256_flip(&sc);
PQCLEAN_FALCON512_CLEAN_hash_to_point(&sc, hm, 9, tmp.b);
inner_shake256_init(&sc);
inner_shake256_inject(&sc, nonce, NONCELEN);
inner_shake256_inject(&sc, m, mlen);
inner_shake256_flip(&sc);
PQCLEAN_FALCON512_CLEAN_hash_to_point_ct(&sc, hm, 9, tmp.b);
/*
* Verify signature.

View File

@ -36,7 +36,7 @@
/* see inner.h */
void
PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, inner_shake256_context *src) {
/*
* To ensure reproducibility for a given seed, we
* must enforce little-endian interpretation of
@ -46,7 +46,7 @@ PQCLEAN_FALCON512_CLEAN_prng_init(prng *p, shake256_context *src) {
uint64_t th, tl;
int i;
shake256_extract(src, tmp, 56);
inner_shake256_extract(src, tmp, 56);
for (i = 0; i < 14; i ++) {
uint32_t w;

View File

@ -417,8 +417,170 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
size_t n, hn;
const fpr *tree0, *tree1;
n = (size_t)1 << logn;
if (n == 1) {
/*
* When logn == 2, we inline the last two recursion levels.
*/
if (logn == 2) {
fpr x0, x1, y0, y1, w0, w1, w2, w3, sigma;
fpr a_re, a_im, b_re, b_im, c_re, c_im;
tree0 = tree + 4;
tree1 = tree + 8;
/*
* We split t1 into w*, then do the recursive invocation,
* with output in w*. We finally merge back into z1.
*/
a_re = t1[0];
a_im = t1[2];
b_re = t1[1];
b_im = t1[3];
c_re = fpr_add(a_re, b_re);
c_im = fpr_add(a_im, b_im);
w0 = fpr_half(c_re);
w1 = fpr_half(c_im);
c_re = fpr_sub(a_re, b_re);
c_im = fpr_sub(a_im, b_im);
w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
x0 = w2;
x1 = w3;
sigma = tree1[3];
w2 = fpr_of(samp(samp_ctx, x0, sigma));
w3 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, w2);
a_im = fpr_sub(x1, w3);
b_re = tree1[0];
b_im = tree1[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, w0);
x1 = fpr_add(c_im, w1);
sigma = tree1[2];
w0 = fpr_of(samp(samp_ctx, x0, sigma));
w1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = w0;
a_im = w1;
b_re = w2;
b_im = w3;
c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
z1[0] = w0 = fpr_add(a_re, c_re);
z1[2] = w2 = fpr_add(a_im, c_im);
z1[1] = w1 = fpr_sub(a_re, c_re);
z1[3] = w3 = fpr_sub(a_im, c_im);
/*
* Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
*/
w0 = fpr_sub(t1[0], w0);
w1 = fpr_sub(t1[1], w1);
w2 = fpr_sub(t1[2], w2);
w3 = fpr_sub(t1[3], w3);
a_re = w0;
a_im = w2;
b_re = tree[0];
b_im = tree[2];
w0 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
w2 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
a_re = w1;
a_im = w3;
b_re = tree[1];
b_im = tree[3];
w1 = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
w3 = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
w0 = fpr_add(w0, t0[0]);
w1 = fpr_add(w1, t0[1]);
w2 = fpr_add(w2, t0[2]);
w3 = fpr_add(w3, t0[3]);
/*
* Second recursive invocation.
*/
a_re = w0;
a_im = w2;
b_re = w1;
b_im = w3;
c_re = fpr_add(a_re, b_re);
c_im = fpr_add(a_im, b_im);
w0 = fpr_half(c_re);
w1 = fpr_half(c_im);
c_re = fpr_sub(a_re, b_re);
c_im = fpr_sub(a_im, b_im);
w2 = fpr_mul(fpr_add(c_re, c_im), fpr_invsqrt8);
w3 = fpr_mul(fpr_sub(c_im, c_re), fpr_invsqrt8);
x0 = w2;
x1 = w3;
sigma = tree0[3];
w2 = y0 = fpr_of(samp(samp_ctx, x0, sigma));
w3 = y1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, y0);
a_im = fpr_sub(x1, y1);
b_re = tree0[0];
b_im = tree0[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, w0);
x1 = fpr_add(c_im, w1);
sigma = tree0[2];
w0 = fpr_of(samp(samp_ctx, x0, sigma));
w1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = w0;
a_im = w1;
b_re = w2;
b_im = w3;
c_re = fpr_mul(fpr_sub(b_re, b_im), fpr_invsqrt2);
c_im = fpr_mul(fpr_add(b_re, b_im), fpr_invsqrt2);
z0[0] = fpr_add(a_re, c_re);
z0[2] = fpr_add(a_im, c_im);
z0[1] = fpr_sub(a_re, c_re);
z0[3] = fpr_sub(a_im, c_im);
return;
}
/*
* Case logn == 1 is reachable only when using Falcon-2 (the
* smallest size for which Falcon is mathematically defined, but
* of course way too insecure to be of any use).
*/
if (logn == 1) {
fpr x0, x1, y0, y1, sigma;
fpr a_re, a_im, b_re, b_im, c_re, c_im;
x0 = t1[0];
x1 = t1[1];
sigma = tree[3];
z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
a_re = fpr_sub(x0, y0);
a_im = fpr_sub(x1, y1);
b_re = tree[0];
b_im = tree[1];
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
x0 = fpr_add(c_re, t0[0]);
x1 = fpr_add(c_im, t0[1]);
sigma = tree[2];
z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
return;
}
/*
* Normal end of recursion is for logn == 0. Since the last
* steps of the recursions were inlined in the blocks above
* (when logn == 1 or 2), this case is not reachable, and is
* retained here only for documentation purposes.
if (logn == 0) {
fpr x0, x1, sigma;
x0 = t0[0];
@ -429,6 +591,13 @@ ffSampling_fft(samplerZ samp, void *samp_ctx,
return;
}
*/
/*
* General recursive case (logn >= 3).
*/
n = (size_t)1 << logn;
hn = n >> 1;
tree0 = tree + n;
tree1 = tree + n + ffLDL_treesize(logn - 1);
@ -480,7 +649,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
const fpr *b00, *b01, *b10, *b11, *tree;
fpr ni;
uint32_t sqn, ng;
int16_t *s2tmp;
int16_t *s1tmp, *s2tmp;
n = MKN(logn);
t0 = tmp;
@ -542,6 +711,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
/*
* Compute the signature.
*/
s1tmp = (int16_t *)tx;
sqn = 0;
ng = 0;
for (u = 0; u < n; u ++) {
@ -550,6 +720,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
sqn += (uint32_t)(z * z);
ng |= sqn;
s1tmp[u] = (int16_t)z;
}
sqn |= -(ng >> 31);
@ -568,6 +739,7 @@ do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
}
if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
memcpy(s2, s2tmp, n * sizeof * s2);
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
return 1;
}
return 0;
@ -592,7 +764,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
fpr ni;
uint32_t sqn, ng;
int16_t *s2tmp;
int16_t *s1tmp, *s2tmp;
n = MKN(logn);
@ -745,6 +917,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
PQCLEAN_FALCON512_CLEAN_iFFT(t0, logn);
PQCLEAN_FALCON512_CLEAN_iFFT(t1, logn);
s1tmp = (int16_t *)tx;
sqn = 0;
ng = 0;
for (u = 0; u < n; u ++) {
@ -753,6 +926,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
sqn += (uint32_t)(z * z);
ng |= sqn;
s1tmp[u] = (int16_t)z;
}
sqn |= -(ng >> 31);
@ -771,6 +945,7 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
}
if (PQCLEAN_FALCON512_CLEAN_is_short_half(sqn, s2tmp, logn)) {
memcpy(s2, s2tmp, n * sizeof * s2);
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
return 1;
}
return 0;
@ -780,29 +955,28 @@ do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
* Sample an integer value along a half-gaussian distribution centered
* on zero and standard deviation 1.8205, with a precision of 72 bits.
*/
static int
gaussian0_sampler(prng *p) {
int
PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(prng *p) {
static const uint32_t dist[] = {
6031371U, 13708371U, 13035518U,
5186761U, 1487980U, 12270720U,
3298653U, 4688887U, 5511555U,
1551448U, 9247616U, 9467675U,
539632U, 14076116U, 5909365U,
138809U, 10836485U, 13263376U,
26405U, 15335617U, 16601723U,
3714U, 14514117U, 13240074U,
386U, 8324059U, 3276722U,
29U, 12376792U, 7821247U,
1U, 11611789U, 3398254U,
0U, 1194629U, 4532444U,
0U, 37177U, 2973575U,
0U, 855U, 10369757U,
0U, 14U, 9441597U,
0U, 0U, 3075302U,
0U, 0U, 28626U,
0U, 0U, 197U,
0U, 0U, 1U
10745844u, 3068844u, 3741698u,
5559083u, 1580863u, 8248194u,
2260429u, 13669192u, 2736639u,
708981u, 4421575u, 10046180u,
169348u, 7122675u, 4136815u,
30538u, 13063405u, 7650655u,
4132u, 14505003u, 7826148u,
417u, 16768101u, 11363290u,
31u, 8444042u, 8086568u,
1u, 12844466u, 265321u,
0u, 1232676u, 13644283u,
0u, 38047u, 9111839u,
0u, 870u, 6138264u,
0u, 14u, 12545723u,
0u, 0u, 3104126u,
0u, 0u, 28824u,
0u, 0u, 198u,
0u, 0u, 1u
};
uint32_t v0, v1, v2, hi;
@ -843,7 +1017,7 @@ gaussian0_sampler(prng *p) {
* Sample a bit with probability exp(-x) for some x >= 0.
*/
static int
BerExp(prng *p, fpr x) {
BerExp(prng *p, fpr x, fpr ccs) {
int s, i;
fpr r;
uint32_t sw, w;
@ -880,7 +1054,7 @@ BerExp(prng *p, fpr x) {
* case). The bias is negligible since fpr_expm_p63() only computes
* with 51 bits of precision or so.
*/
z = ((fpr_expm_p63(r) << 1) - 1) >> s;
z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
/*
* Sample a bit with probability exp(-x). Since x = s*log(2) + r,
@ -896,11 +1070,6 @@ BerExp(prng *p, fpr x) {
return (int)(w >> 31);
}
typedef struct {
prng p;
fpr sigma_min;
} sampler_context;
/*
* The sampler produces a random integer that follows a discrete Gaussian
* distribution, centered on mu, and with standard deviation sigma. The
@ -909,8 +1078,8 @@ typedef struct {
* The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
* 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
*/
static int
sampler(void *ctx, fpr mu, fpr isigma) {
int
PQCLEAN_FALCON512_CLEAN_sampler(void *ctx, fpr mu, fpr isigma) {
sampler_context *spc;
int s;
fpr r, dss, ccs;
@ -952,7 +1121,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
* - b = 0: z <= 0 and sampled against a Gaussian
* centered on 0.
*/
z0 = gaussian0_sampler(&spc->p);
z0 = PQCLEAN_FALCON512_CLEAN_gaussian0_sampler(&spc->p);
b = prng_get_u8(&spc->p) & 1;
z = b + ((b << 1) - 1) * z0;
@ -983,8 +1152,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
*/
x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
x = fpr_mul(x, ccs);
if (BerExp(&spc->p, x)) {
if (BerExp(&spc->p, x, ccs)) {
/*
* Rejection sampling was centered on r, but the
* actual center is mu = s + r.
@ -996,7 +1164,7 @@ sampler(void *ctx, fpr mu, fpr isigma) {
/* see inner.h */
void
PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, inner_shake256_context *rng,
const fpr *expanded_key,
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
fpr *ftmp;
@ -1025,7 +1193,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
? fpr_sigma_min_10
: fpr_sigma_min_9;
PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
samp = sampler;
samp = PQCLEAN_FALCON512_CLEAN_sampler;
samp_ctx = &spc;
/*
@ -1040,7 +1208,7 @@ PQCLEAN_FALCON512_CLEAN_sign_tree(int16_t *sig, shake256_context *rng,
/* see inner.h */
void
PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, inner_shake256_context *rng,
const int8_t *f, const int8_t *g,
const int8_t *F, const int8_t *G,
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
@ -1070,7 +1238,7 @@ PQCLEAN_FALCON512_CLEAN_sign_dyn(int16_t *sig, shake256_context *rng,
? fpr_sigma_min_10
: fpr_sigma_min_9;
PQCLEAN_FALCON512_CLEAN_prng_init(&spc.p, rng);
samp = sampler;
samp = PQCLEAN_FALCON512_CLEAN_sampler;
samp_ctx = &spc;
/*

View File

@ -649,7 +649,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
}
/*
* Compute s1 = s2*h - c0 mod phi mod q (in tt[]).
* Compute -s1 = s2*h - c0 mod phi mod q (in tt[]).
*/
mq_NTT(tt, logn);
mq_poly_montymul_ntt(tt, h, logn);
@ -657,7 +657,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
mq_poly_sub(tt, c0, logn);
/*
* Normalize s1 elements into the [-q/2..q/2] range.
* Normalize -s1 elements into the [-q/2..q/2] range.
*/
for (u = 0; u < n; u ++) {
int32_t w;
@ -668,7 +668,7 @@ PQCLEAN_FALCON512_CLEAN_verify_raw(const uint16_t *c0, const int16_t *s2,
}
/*
* Signature is valid if and only if the aggregate (s1,s2) vector
* Signature is valid if and only if the aggregate (-s1,s2) vector
* is short enough.
*/
return PQCLEAN_FALCON512_CLEAN_is_short((int16_t *)tt, s2, logn);
@ -699,7 +699,7 @@ PQCLEAN_FALCON512_CLEAN_compute_public(uint16_t *h,
return 1;
}
/* see internal.h */
/* see inner.h */
int
PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
const int8_t *f, const int8_t *g, const int8_t *F,
@ -743,3 +743,110 @@ PQCLEAN_FALCON512_CLEAN_complete_private(int8_t *G,
}
return 1;
}
/* see inner.h */
/*
 * Test whether s2 is invertible modulo X^n+1 modulo q: this holds
 * exactly when all of its NTT coefficients are non-zero. Processing
 * is branchless with regard to the values of s2.
 *
 * s2    signature polynomial, n = 2^logn signed coefficients
 * logn  log2 of the polynomial degree
 * tmp   scratch area; must hold at least n uint16_t values
 *
 * Returns 1 if s2 is invertible, 0 otherwise.
 */
int
PQCLEAN_FALCON512_CLEAN_is_invertible(
const int16_t *s2, unsigned logn, uint8_t *tmp) {
size_t u, n;
uint16_t *tt;
uint32_t r;
n = (size_t)1 << logn;
tt = (uint16_t *)tmp;
/*
 * Normalize signed coefficients into [0, q-1] without branching:
 * after the cast, a negative value has its top bit set, so
 * (Q & -(w >> 31)) adds Q exactly when the value was negative.
 */
for (u = 0; u < n; u ++) {
uint32_t w;
w = (uint32_t)s2[u];
w += Q & -(w >> 31);
tt[u] = (uint16_t)w;
}
mq_NTT(tt, logn);
/*
 * Aggregate a flag: (tt[u] - 1) wraps (top bit set) if and only if
 * tt[u] == 0, so the top bit of r records whether any NTT
 * coefficient of s2 is zero.
 */
r = 0;
for (u = 0; u < n; u ++) {
r |= (uint32_t)(tt[u] - 1);
}
/* r >> 31 is 1 when some NTT coefficient was zero (not invertible). */
return (int)(1u - (r >> 31));
}
/* see inner.h */
/*
 * Rebuild the public key h from a hashed message point c0 and a
 * signature (s1, s2): h = (c0 - s1) / s2 mod phi mod q. The division
 * requires s2 to be invertible (all NTT coefficients non-zero); the
 * failure condition is tracked in a flag so that processing does not
 * deviate from strict constant-time behavior.
 *
 * h     output: recovered public key, n = 2^logn coefficients
 * c0    hashed message point
 * s1    first signature half (signed coefficients)
 * s2    second signature half (signed coefficients)
 * logn  log2 of the polynomial degree
 * tmp   scratch area; must hold at least n uint16_t values
 *
 * Returns 1 if (s1, s2) is short enough AND s2 was invertible,
 * 0 otherwise. The caller must still check that the rebuilt public
 * key matches the expected value.
 */
int
PQCLEAN_FALCON512_CLEAN_verify_recover(uint16_t *h,
const uint16_t *c0, const int16_t *s1, const int16_t *s2,
unsigned logn, uint8_t *tmp) {
size_t u, n;
uint16_t *tt;
uint32_t r;
n = (size_t)1 << logn;
/*
 * Reduce elements of s1 and s2 modulo q; then write s2 into tt[]
 * and c0 - s1 into h[].
 */
tt = (uint16_t *)tmp;
for (u = 0; u < n; u ++) {
uint32_t w;
/* Branchless normalization: add Q exactly when the value is negative. */
w = (uint32_t)s2[u];
w += Q & -(w >> 31);
tt[u] = (uint16_t)w;
w = (uint32_t)s1[u];
w += Q & -(w >> 31);
w = mq_sub(c0[u], w);
h[u] = (uint16_t)w;
}
/*
 * Compute h = (c0 - s1) / s2. If one of the coefficients of s2
 * is zero (in NTT representation) then the operation fails. We
 * keep that information into a flag so that we do not deviate
 * from strict constant-time processing; if all coefficients of
 * s2 are non-zero, then the high bit of r will be zero.
 */
mq_NTT(tt, logn);
mq_NTT(h, logn);
r = 0;
for (u = 0; u < n; u ++) {
/* (tt[u] - 1) wraps to set the top bit exactly when tt[u] == 0. */
r |= (uint32_t)(tt[u] - 1);
h[u] = (uint16_t)mq_div_12289(h[u], tt[u]);
}
mq_iNTT(h, logn);
/*
 * Signature is acceptable if and only if it is short enough,
 * and s2 was invertible mod phi mod q. The caller must still
 * check that the rebuilt public key matches the expected
 * value (e.g. through a hash).
 */
/*
 * ~r has its top bit set iff s2 was invertible; -is_short() is
 * all-ones iff the signature is short. The top bit of their AND
 * is therefore the conjunction of both acceptance conditions.
 */
r = ~r & (uint32_t) - PQCLEAN_FALCON512_CLEAN_is_short(s1, s2, logn);
return (int)(r >> 31);
}
/* see inner.h */
/*
 * Count how many NTT coefficients of sig are zero.
 *
 * sig   polynomial, len = 2^logn signed coefficients
 * logn  log2 of the polynomial degree
 * tmp   scratch area; must hold at least len uint16_t values
 *
 * Returns the number of zero NTT coefficients.
 */
int
PQCLEAN_FALCON512_CLEAN_count_nttzero(const int16_t *sig, unsigned logn, uint8_t *tmp) {
    uint16_t *work;
    size_t idx, len;
    uint32_t zcount;

    len = (size_t)1 << logn;
    work = (uint16_t *)tmp;

    /*
     * Normalize each signed coefficient into [0, q-1] without
     * branching: after the cast, a negative value has its top bit
     * set, so (Q & -(v >> 31)) adds Q exactly when needed.
     */
    for (idx = 0; idx < len; idx ++) {
        uint32_t v;

        v = (uint32_t)sig[idx];
        v += Q & -(v >> 31);
        work[idx] = (uint16_t)v;
    }
    mq_NTT(work, logn);

    /*
     * (v - 1) wraps (top bit set) exactly when v == 0, so summing
     * the top bits counts the zero coefficients without branching
     * on their values.
     */
    zcount = 0;
    for (idx = 0; idx < len; idx ++) {
        zcount += ((uint32_t)work[idx] - 1u) >> 31;
    }
    return (int)zcount;
}