pqc/crypto_sign/falcon-512/clean/fpr.h

/*
 * Floating-point operations.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019  Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */


/* ====================================================================== */
/*
 * Custom floating-point implementation with integer arithmetics. We
 * use IEEE-754 "binary64" format, with some simplifications:
 *
 *   - Top bit is s = 1 for negative, 0 for positive.
 *
 *   - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).
 *
 *   - Mantissa m uses the 52 low bits.
 *
 * Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))
 * i.e. the mantissa really is a 53-bit number (less than 2.0, but not
 * less than 1.0), but the top bit (equal to 1 by definition) is omitted
 * in the encoding.
 *
 * In IEEE-754, there are some special values:
 *
 *   - If e = 2047, then the value is either an infinite (m = 0) or
 *     a NaN (m != 0).
 *
 *   - If e = 0, then the value is either a zero (m = 0) or a subnormal,
 *     aka "denormalized number" (m != 0).
 *
 * Of these, we only need the zeros. The caller is responsible for not
 * providing operands that would lead to infinites, NaNs or subnormals.
 * If inputs are such that values go out of range, then indeterminate
 * values are returned (it would still be deterministic, but no specific
 * value may be relied upon).
 *
 * At the C level, the three parts are stored in a 64-bit unsigned
 * word.
 *
 * One may note that a property of the IEEE-754 format is that order
 * is preserved for positive values: if two positive floating-point
 * values x and y are such that x < y, then their respective encodings
 * as _signed_ 64-bit integers i64(x) and i64(y) will be such that
 * i64(x) < i64(y). For negative values, order is reversed: if x < 0,
 * y < 0, and x < y, then i64(x) > i64(y).
 *
 * IMPORTANT ASSUMPTIONS:
 * ======================
 *
 * For proper computations, and constant-time behaviour, we assume the
 * following:
 *
 *   - 32x32->64 multiplication (unsigned) has an execution time that
 *     is independent of its operands. This is true of most modern
 *     x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+
 *     and M3 (in the M0 and M0+, this is done in software, so it depends
 *     on that routine), and the PowerPC cores from the G3/G4 lines.
 *     For more info, see: https://www.bearssl.org/ctmul.html
 *
 *   - Left-shifts and right-shifts of 32-bit values have an execution
 *     time which does not depend on the shifted value nor on the
 *     shift count. An historical exception is the Pentium IV, but most
 *     modern CPU have barrel shifters. Some small microcontrollers
 *     might have varying-time shifts (not the ARM Cortex M*, though).
 *
 *   - Right-shift of a signed negative value performs a sign extension.
 *     As per the C standard, this operation returns an
 *     implementation-defined result (this is NOT an "undefined
 *     behaviour"). On most/all systems, an arithmetic shift is
 *     performed, because this is what makes most sense.
 */

/*
 * Normally we should declare the 'fpr' type to be a struct or union
 * around the internal 64-bit value; however, we want to use the
 * direct 64-bit integer type to enable a lighter call convention on
 * ARM platforms. This means that direct (invalid) use of operators
 * such as '*' or '+' will not be caught by the compiler. We rely on
 * the "normal" (non-emulated) code to detect such instances.
 */
typedef uint64_t fpr; /* raw binary64 bit pattern: sign (bit 63) | exponent (bits 52..62) | mantissa (bits 0..51) */

/*
 * For computations, we split values into an integral mantissa in the
 * 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is
 * "sticky" (it is set to 1 if any of the bits below it is 1); when
 * re-encoding, the low two bits are dropped, but may induce an
 * increment in the value for proper rounding.
 */

/*
 * Right-shift a 64-bit unsigned value by a possibly secret shift count.
 * We assumed that the underlying architecture had a barrel shifter for
 * 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will
 * typically invoke a software routine that is not necessarily
 * constant-time; hence the function below.
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline uint64_t
fpr_ursh(uint64_t x, int n) {
    /*
     * Two-stage shift: first an optional shift by exactly 32 bits,
     * chosen with a mask (all-ones when bit 5 of n is set, zero
     * otherwise), then a shift by the low five bits of n.
     */
    uint64_t take_high = -(uint64_t)(n >> 5);
    uint64_t by32 = x >> 32;

    x = (x & ~take_high) | (by32 & take_high);
    return x >> (n & 31);
}

/*
 * Right-shift a 64-bit signed value by a possibly secret shift count
 * (see fpr_ursh() for the rationale).
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline int64_t
fpr_irsh(int64_t x, int n) {
    /*
     * Same two-stage scheme as the unsigned variant, on a signed
     * operand: the right shifts rely on the implementation
     * performing sign extension for negative values.
     */
    int64_t take_high = -(int64_t)(n >> 5);
    int64_t by32 = x >> 32;

    x = (x & ~take_high) | (by32 & take_high);
    return x >> (n & 31);
}

/*
 * Left-shift a 64-bit unsigned value by a possibly secret shift count
 * (see fpr_ursh() for the rationale).
 *
 * Shift count n MUST be in the 0..63 range.
 */
static inline uint64_t
fpr_ulsh(uint64_t x, int n) {
    /*
     * Optional shift by exactly 32 bits (selected by a mask built
     * from bit 5 of n), followed by a shift by the low five bits
     * of n.
     */
    uint64_t take_high = -(uint64_t)(n >> 5);
    uint64_t by32 = x << 32;

    x = (x & ~take_high) | (by32 & take_high);
    return x << (n & 31);
}

/*
 * Expectations:
 *   s = 0 or 1
 *   exponent e is "arbitrary" and unbiased
 *   2^54 <= m < 2^55
 * Numerical value is (-1)^s * m * 2^e
 *
 * Exponents which are too low lead to value zero. If the exponent is
 * too large, the returned value is indeterminate.
 *
 * If m = 0, then a zero is returned (using the provided sign).
 * If e < -1076, then a zero is returned (regardless of the value of m).
 * If e >= -1076 and m != 0, m must be within the expected range
 * (2^54 to 2^55-1).
 */
static inline fpr
FPR(int s, int e, uint64_t m) {
    /*
     * Assemble a sign (s), an unbiased exponent (e) and a 55-bit
     * mantissa (m, two extra low bits, the lowest one sticky) into
     * an encoded value, rounding to nearest with ties to even.
     * Every step is a mask or an arithmetic operation; there is no
     * conditional branch.
     */
    int biased;
    uint32_t flag;
    uint64_t mant;
    fpr sign_part, exp_part, out;
    unsigned low3;

    /*
     * Shift the exponent so that the smallest accepted value maps
     * to zero. A negative result (top bit set) means the value is
     * too small: the mantissa is then wiped so that a zero comes
     * out.
     */
    biased = e + 1076;
    flag = (uint32_t)biased >> 31;
    mant = m & ((uint64_t)flag - 1);

    /*
     * Bit 54 is set for any in-range mantissa and clear for a zero
     * mantissa; in the latter case the exponent is forced to zero
     * too. The sign is left untouched.
     */
    flag = (uint32_t)(mant >> 54);
    biased &= -(int)flag;

    /*
     * After dropping the two extra bits, the top mantissa bit sits
     * at position 52, i.e. in the exponent field; the addition
     * makes it bump the exponent by one, except for a zero
     * mantissa.
     */
    sign_part = (uint64_t)s << 63;
    exp_part = (uint64_t)(uint32_t)biased << 52;
    out = (sign_part | (mant >> 2)) + exp_part;

    /*
     * 0xC8 is a bit table indexed by the low three mantissa bits:
     * entries 3 (011), 6 (110) and 7 (111) are set, and those are
     * the patterns that round up. A carry out of the mantissa
     * propagates into the exponent field.
     */
    low3 = (unsigned)mant & 7U;
    out += (0xC8U >> low3) & 1;
    return out;
}

/*
 * fpr_scaled(i, sc): declared here, no body in this header. The macro
 * maps the short name onto a PQCLEAN_FALCON512_CLEAN_-prefixed
 * external symbol. fpr_of() calls it with sc = 0 to convert an
 * integer. NOTE(review): presumably returns i * 2^sc as an fpr --
 * confirm against the implementation.
 */
#define fpr_scaled   PQCLEAN_FALCON512_CLEAN_fpr_scaled
fpr fpr_scaled(int64_t i, int sc);

static inline fpr
fpr_of(int64_t i) {
    /*
     * Integer-to-fpr conversion: delegate to fpr_scaled() with a
     * zero scale argument.
     */
    const int no_scaling = 0;

    return fpr_scaled(i, no_scaling);
}

/*
 * Pre-encoded constants: each initializer is the 64-bit encoding of
 * the value, written as a decimal integer. Encodings with the sign
 * bit set exceed 2^63-1 and carry a 'U' suffix.
 *
 * Values annotated below follow directly from the encoding (biased
 * exponent in bits 52..62, 52-bit mantissa). The other constants
 * (fpr_inverse_of_q, fpr_inv_2sqrsigma0, fpr_inv_sigma,
 * fpr_sigma_min_9, fpr_sigma_min_10, fpr_log2, fpr_inv_log2,
 * fpr_bnorm_max) are identified by name only; their numerical values
 * are not re-derived here -- TODO confirm against the reference
 * implementation before changing any of them.
 */
static const fpr fpr_q = 4667981563525332992; /* 12289.0 */
static const fpr fpr_inverse_of_q = 4545632735260551042;
static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;
static const fpr fpr_inv_sigma = 4573359825155195350;
static const fpr fpr_sigma_min_9 = 4608495221497168882;
static const fpr fpr_sigma_min_10 = 4608586345619182117;
static const fpr fpr_log2 = 4604418534313441775;
static const fpr fpr_inv_log2 = 4609176140021203710;
static const fpr fpr_bnorm_max = 4670353323383631276;
static const fpr fpr_zero = 0; /* +0.0 */
static const fpr fpr_one = 4607182418800017408; /* 1.0 = 1023 << 52 */
static const fpr fpr_two = 4611686018427387904; /* 2.0 = 1024 << 52 */
static const fpr fpr_onehalf = 4602678819172646912; /* 0.5 = 1022 << 52 */
static const fpr fpr_ptwo31 = 4746794007248502784; /* 2^31 = 1054 << 52 */
static const fpr fpr_ptwo31m1 = 4746794007244308480; /* 2^31 - 1 */
static const fpr fpr_mtwo31m1 = 13970166044099084288U; /* -(2^31 - 1): fpr_ptwo31m1 with bit 63 set */
/*
 * Same encoding as fpr_ptwo63: 2^63 - 1 would need 63 significant
 * bits, more than the 53-bit mantissa holds.
 */
static const fpr fpr_ptwo63m1 = 4890909195324358656;
static const fpr fpr_mtwo63m1 = 14114281232179134464U; /* fpr_ptwo63m1 with bit 63 set */
static const fpr fpr_ptwo63 = 4890909195324358656; /* 2^63 = 1086 << 52 */

static inline int64_t
fpr_rint(fpr x) {
    /*
     * Round x to the nearest integer, ties going to the even
     * integer. The input is assumed to fit in
     * -(2^63-1)..+(2^63-1).
     */
    const uint64_t implicit_one = (uint64_t)1 << 62;
    uint64_t mag, dropped, rounded;
    uint32_t neg, low_or, code;
    int shift;

    /*
     * Put the mantissa, with its implicit top bit restored, in
     * bits 10..62 of a 63-bit integer; the amount by which it must
     * then be shifted right follows from the exponent field.
     */
    mag = ((x << 10) & (((uint64_t)1 << 63) - 1)) | implicit_one;
    shift = 1085 - ((int)(x >> 52) & 0x7FF);

    /*
     * A shift count of 64 or more yields zero; this is also the
     * path taken by a zero input. The count is then reduced to the
     * 0..63 range accepted by the shift helpers.
     */
    mag &= -(uint64_t)((uint32_t)(shift - 64) >> 31);
    shift &= 63;

    /*
     * Move the lowest kept bit to position 63, so that 'dropped'
     * holds that bit followed by all discarded bits. It is then
     * condensed to three bits: lowest kept bit, highest discarded
     * bit, and a sticky OR of the remaining discarded bits. That
     * 3-bit code indexes the 0xC8 table, which is set for codes 3,
     * 6 and 7 (the round-up cases).
     */
    dropped = fpr_ulsh(mag, 63 - shift);
    low_or = (uint32_t)dropped | ((uint32_t)(dropped >> 32) & 0x1FFFFFFF);
    code = (uint32_t)(dropped >> 61) | ((low_or | -low_or) >> 31);
    rounded = fpr_ursh(mag, shift) + (uint64_t)((0xC8U >> code) & 1U);

    /*
     * Conditional negation driven by the sign bit of x.
     */
    neg = (uint32_t)(x >> 63);
    return ((int64_t)rounded ^ -(int64_t)neg) + (int64_t)neg;
}

static inline int64_t
fpr_floor(fpr x) {
    /*
     * Round x toward minus infinity. The input is assumed to fit
     * in -(2^63-1)..+(2^63-1).
     */
    int64_t sign, value, too_small;
    int shift;

    /*
     * Build the absolute value as a 63-bit integer with the
     * implicit mantissa bit at position 62, then negate it when
     * the sign bit of x is set. The right-shift count that brings
     * it back to scale follows from the exponent field.
     */
    sign = (int64_t)(x >> 63);
    shift = 1085 - ((int)(x >> 52) & 0x7FF);
    value = (int64_t)(((x << 10) | ((uint64_t)1 << 62))
                      & (((uint64_t)1 << 63) - 1));
    value = (value ^ -sign) + sign;

    /*
     * An arithmetic right shift rounds toward minus infinity for
     * both signs.
     */
    value = fpr_irsh(value, shift & 63);

    /*
     * When the real shift count is 64 or more, the shifted value
     * is meaningless and is replaced with 0 (sign clear) or -1
     * (sign set); as a consequence -0 yields -1. A zero input also
     * goes through this path, which hides the fact that an
     * implicit mantissa bit was wrongly inserted for it.
     */
    too_small = -(int64_t)((uint32_t)(63 - shift) >> 31);
    value = (value & ~too_small) | (-sign & too_small);
    return value;
}

static inline int64_t
fpr_trunc(fpr x) {
    /*
     * Round x toward zero. The input is assumed to fit in
     * -(2^63-1)..+(2^63-1).
     */
    uint64_t neg, mag;
    int shift;

    /*
     * Absolute value as a 63-bit integer (implicit mantissa bit at
     * position 62), shifted right according to the exponent field.
     */
    neg = x >> 63;
    shift = 1085 - ((int)(x >> 52) & 0x7FF);
    mag = ((x << 10) | ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);
    mag = fpr_ursh(mag, shift & 63);

    /*
     * A real shift count above 63 means the magnitude is below 1:
     * the result is forced to zero. A zero input takes this path
     * as well.
     */
    mag &= -(uint64_t)((uint32_t)(shift - 64) >> 31);

    /*
     * Conditional two's complement negation, then reinterpretation
     * of the 64-bit pattern as a signed integer.
     */
    mag = (mag ^ -neg) + neg;
    return *(int64_t *)&mag;
}

/*
 * fpr_add(x, y): declared here, no body in this header. The macro maps
 * the short name onto a PQCLEAN_FALCON512_CLEAN_-prefixed external
 * symbol. fpr_sub() calls it after flipping the sign bit of its
 * second operand. NOTE(review): presumably returns x + y -- confirm
 * against the implementation.
 */
#define fpr_add   PQCLEAN_FALCON512_CLEAN_fpr_add
fpr fpr_add(fpr x, fpr y);

static inline fpr
fpr_sub(fpr x, fpr y) {
    /*
     * Flip the sign bit (bit 63) of y and hand both operands to
     * fpr_add().
     */
    const fpr sign_bit = (uint64_t)1 << 63;

    return fpr_add(x, y ^ sign_bit);
}

static inline fpr
fpr_neg(fpr x) {
    /* Negation only toggles the sign bit (bit 63). */
    const fpr sign_bit = (uint64_t)1 << 63;

    return x ^ sign_bit;
}

static inline fpr
fpr_half(fpr x) {
    /*
     * Halving decrements the exponent field. If that field was
     * zero, the subtraction borrows and leaves it at 0x7FF; adding
     * 1 then overflows into bit 11, which is used to detect that
     * case and clear the whole word.
     */
    fpr lowered = x - ((uint64_t)1 << 52);
    uint32_t wrapped = (((uint32_t)(lowered >> 52) & 0x7FF) + 1) >> 11;

    return lowered & ((uint64_t)wrapped - 1);
}

static inline fpr
fpr_double(fpr x) {
    /*
     * Doubling increments the exponent field, unless that field is
     * zero (the value is then left unchanged). Adding 0x7FF to the
     * 11-bit field carries into bit 11 exactly when the field is
     * non-zero, which yields the 0/1 increment without a branch.
     * Infinites and NaNs are not considered.
     */
    unsigned exp_field = (unsigned)(x >> 52) & 0x7FFU;
    uint64_t bump = (uint64_t)((exp_field + 0x7FFU) >> 11);

    return x + (bump << 52);
}

/*
 * fpr_mul(x, y): declared here, no body in this header. The macro maps
 * the short name onto a PQCLEAN_FALCON512_CLEAN_-prefixed external
 * symbol. fpr_sqr() calls it with both operands equal.
 * NOTE(review): presumably returns x * y -- confirm against the
 * implementation.
 */
#define fpr_mul   PQCLEAN_FALCON512_CLEAN_fpr_mul
fpr fpr_mul(fpr x, fpr y);

static inline fpr
fpr_sqr(fpr x) {
    /* Squaring: fpr_mul() with x as both operands. */
    fpr squared = fpr_mul(x, x);

    return squared;
}

/*
 * fpr_div(x, y): declared here, no body in this header. The macro maps
 * the short name onto a PQCLEAN_FALCON512_CLEAN_-prefixed external
 * symbol. fpr_inv() calls it with the encoding of 1.0 as first
 * operand. NOTE(review): presumably returns x / y -- confirm against
 * the implementation.
 */
#define fpr_div   PQCLEAN_FALCON512_CLEAN_fpr_div
fpr fpr_div(fpr x, fpr y);

static inline fpr
fpr_inv(fpr x) {
    /*
     * Divide the constant 1.0 by x. The literal is 1023 << 52, the
     * encoding of 1.0 (same bit pattern as fpr_one).
     */
    const fpr one = 4607182418800017408u;

    return fpr_div(one, x);
}

/*
 * fpr_sqrt(x): declared here, no body in this header. The macro maps
 * the short name onto a PQCLEAN_FALCON512_CLEAN_-prefixed external
 * symbol. NOTE(review): presumably returns the square root of x --
 * confirm against the implementation, including its behaviour on
 * negative inputs.
 */
#define fpr_sqrt   PQCLEAN_FALCON512_CLEAN_fpr_sqrt
fpr fpr_sqrt(fpr x);

static inline int
fpr_lt(fpr x, fpr y) {
    /*
     * Returns 1 if x < y, 0 otherwise.
     *
     * The encodings are compared as signed 64-bit integers. That
     * comparison is already right whenever at least one operand
     * has its sign bit clear. When both sign bits are set, the
     * integer order is the reverse of the numerical order, so the
     * "greater than" outcome is selected instead of the "lower
     * than" one; simply inverting the "lower than" outcome would
     * give the wrong answer for x = y.
     */
    int64_t sx = *(int64_t *)&x;
    int64_t sy = *(int64_t *)&y;
    int below = sx < sy;
    int above = sx > sy;
    int both_negative = (int)((x & y) >> 63);

    /* Yields 'above' when both_negative is 1, 'below' otherwise. */
    return below ^ ((below ^ above) & both_negative);
}

/*
 * Compute exp(x) for x such that |x| <= ln 2. We want a precision of 50
 * bits or so.
 */
/*
 * Declared here, no body in this header; the macro maps the short name
 * onto a PQCLEAN_FALCON512_CLEAN_-prefixed external symbol. The result
 * is a 64-bit unsigned integer, not an fpr. NOTE(review): the "_p63"
 * suffix suggests a fixed-point result scaled by 2^63, and "expm"
 * suggests exp(-x) -- confirm both against the implementation.
 */
#define fpr_expm_p63   PQCLEAN_FALCON512_CLEAN_fpr_expm_p63
uint64_t fpr_expm_p63(fpr x);

/*
 * Table of fpr constants with external linkage; its size and contents
 * are not visible from this header.
 */
#define fpr_gm_tab   PQCLEAN_FALCON512_CLEAN_fpr_gm_tab
extern const fpr fpr_gm_tab[];

/*
 * Second table of fpr constants with external linkage; its size and
 * contents are not visible from this header.
 */
#define fpr_p2_tab   PQCLEAN_FALCON512_CLEAN_fpr_p2_tab
extern const fpr fpr_p2_tab[];

/* ====================================================================== */
Falcon implementations (integer-only code, constant-time). 2019-07-21 00:44:25 +01:00			`/*`
			`* Floating-point operations.`
			`*`
			`* ==========================(LICENSE BEGIN)============================`
			`*`
			`* Copyright (c) 2017-2019 Falcon Project`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining`
			`* a copy of this software and associated documentation files (the`
			`* "Software"), to deal in the Software without restriction, including`
			`* without limitation the rights to use, copy, modify, merge, publish,`
			`* distribute, sublicense, and/or sell copies of the Software, and to`
			`* permit persons to whom the Software is furnished to do so, subject to`
			`* the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be`
			`* included in all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY`
			`* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,`
			`* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE`
			`* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`
			`*`
			`* ===========================(LICENSE END)=============================`
			`*`
			`* @author Thomas Pornin <thomas.pornin@nccgroup.com>`
			`*/`


			`/* ====================================================================== */`
			`/*`
			`* Custom floating-point implementation with integer arithmetics. We`
			`* use IEEE-754 "binary64" format, with some simplifications:`
			`*`
			`* - Top bit is s = 1 for negative, 0 for positive.`
			`*`
			`* - Exponent e uses the next 11 bits (bits 52 to 62, inclusive).`
			`*`
			`* - Mantissa m uses the 52 low bits.`
			`*`
			`* Encoded value is, in general: (-1)^s * 2^(e-1023) * (1 + m*2^(-52))`
			`* i.e. the mantissa really is a 53-bit number (less than 2.0, but not`
			`* less than 1.0), but the top bit (equal to 1 by definition) is omitted`
			`* in the encoding.`
			`*`
			`* In IEEE-754, there are some special values:`
			`*`
			`* - If e = 2047, then the value is either an infinite (m = 0) or`
			`* a NaN (m != 0).`
			`*`
			`* - If e = 0, then the value is either a zero (m = 0) or a subnormal,`
			`* aka "denormalized number" (m != 0).`
			`*`
			`* Of these, we only need the zeros. The caller is responsible for not`
			`* providing operands that would lead to infinites, NaNs or subnormals.`
			`* If inputs are such that values go out of range, then indeterminate`
			`* values are returned (it would still be deterministic, but no specific`
			`* value may be relied upon).`
			`*`
			`* At the C level, the three parts are stored in a 64-bit unsigned`
			`* word.`
			`*`
			`* One may note that a property of the IEEE-754 format is that order`
			`* is preserved for positive values: if two positive floating-point`
			`* values x and y are such that x < y, then their respective encodings`
			`* as _signed_ 64-bit integers i64(x) and i64(y) will be such that`
			`* i64(x) < i64(y). For negative values, order is reversed: if x < 0,`
			`* y < 0, and x < y, then i64(x) > i64(y).`
			`*`
			`* IMPORTANT ASSUMPTIONS:`
			`* ======================`
			`*`
			`* For proper computations, and constant-time behaviour, we assume the`
			`* following:`
			`*`
			`* - 32x32->64 multiplication (unsigned) has an execution time that`
			`* is independent of its operands. This is true of most modern`
			`* x86 and ARM cores. Notable exceptions are the ARM Cortex M0, M0+`
			`* and M3 (in the M0 and M0+, this is done in software, so it depends`
			`* on that routine), and the PowerPC cores from the G3/G4 lines.`
			`* For more info, see: https://www.bearssl.org/ctmul.html`
			`*`
			`* - Left-shifts and right-shifts of 32-bit values have an execution`
			`* time which does not depend on the shifted value nor on the`
			`* shift count. An historical exception is the Pentium IV, but most`
			`* modern CPU have barrel shifters. Some small microcontrollers`
			`* might have varying-time shifts (not the ARM Cortex M*, though).`
			`*`
			`* - Right-shift of a signed negative value performs a sign extension.`
			`* As per the C standard, this operation returns an`
			`* implementation-defined result (this is NOT an "undefined`
			`* behaviour"). On most/all systems, an arithmetic shift is`
			`* performed, because this is what makes most sense.`
			`*/`

			`/*`
			`* Normally we should declare the 'fpr' type to be a struct or union`
			`* around the internal 64-bit value; however, we want to use the`
			`* direct 64-bit integer type to enable a lighter call convention on`
			`* ARM platforms. This means that direct (invalid) use of operators`
			`* such as '*' or '+' will not be caught by the compiler. We rely on`
			`* the "normal" (non-emulated) code to detect such instances.`
			`*/`
			`typedef uint64_t fpr;`

			`/*`
			`* For computations, we split values into an integral mantissa in the`
			`* 2^54..2^55 range, and an (adjusted) exponent. The lowest bit is`
			`* "sticky" (it is set to 1 if any of the bits below it is 1); when`
			`* re-encoding, the low two bits are dropped, but may induce an`
			`* increment in the value for proper rounding.`
			`*/`

			`/*`
			`* Right-shift a 64-bit unsigned value by a possibly secret shift count.`
			`* We assumed that the underlying architecture had a barrel shifter for`
			`* 32-bit shifts, but for 64-bit shifts on a 32-bit system, this will`
			`* typically invoke a software routine that is not necessarily`
			`* constant-time; hence the function below.`
			`*`
			`* Shift count n MUST be in the 0..63 range.`
			`*/`
			`static inline uint64_t`
			`fpr_ursh(uint64_t x, int n) {`
			`x ^= (x ^ (x >> 32)) & -(uint64_t)(n >> 5);`
			`return x >> (n & 31);`
			`}`

			`/*`
			`* Right-shift a 64-bit signed value by a possibly secret shift count`
			`* (see fpr_ursh() for the rationale).`
			`*`
			`* Shift count n MUST be in the 0..63 range.`
			`*/`
			`static inline int64_t`
			`fpr_irsh(int64_t x, int n) {`
			`x ^= (x ^ (x >> 32)) & -(int64_t)(n >> 5);`
			`return x >> (n & 31);`
			`}`

			`/*`
			`* Left-shift a 64-bit unsigned value by a possibly secret shift count`
			`* (see fpr_ursh() for the rationale).`
			`*`
			`* Shift count n MUST be in the 0..63 range.`
			`*/`
			`static inline uint64_t`
			`fpr_ulsh(uint64_t x, int n) {`
			`x ^= (x ^ (x << 32)) & -(uint64_t)(n >> 5);`
			`return x << (n & 31);`
			`}`

			`/*`
			`* Expectations:`
			`* s = 0 or 1`
			`* exponent e is "arbitrary" and unbiased`
			`* 2^54 <= m < 2^55`
			`* Numerical value is (-1)^s * m * 2^e`
			`*`
			`* Exponents which are too low lead to value zero. If the exponent is`
			`* too large, the returned value is indeterminate.`
			`*`
			`* If m = 0, then a zero is returned (using the provided sign).`
			`* If e < -1076, then a zero is returned (regardless of the value of m).`
			`* If e >= -1076 and m != 0, m must be within the expected range`
			`* (2^54 to 2^55-1).`
			`*/`
			`static inline fpr`
			`FPR(int s, int e, uint64_t m) {`
			`fpr x;`
			`uint32_t t;`
			`unsigned f;`

			`/*`
			`* If e >= -1076, then the value is "normal"; otherwise, it`
			`* should be a subnormal, which we clamp down to zero.`
			`*/`
			`e += 1076;`
			`t = (uint32_t)e >> 31;`
			`m &= (uint64_t)t - 1;`

			`/*`
			`* If m = 0 then we want a zero; make e = 0 too, but conserve`
			`* the sign.`
			`*/`
			`t = (uint32_t)(m >> 54);`
			`e &= -(int)t;`

			`/*`
			`* The 52 mantissa bits come from m. Value m has its top bit set`
			`* (unless it is a zero); we leave it "as is": the top bit will`
			`* increment the exponent by 1, except when m = 0, which is`
			`* exactly what we want.`
			`*/`
			`x = (((uint64_t)s << 63) \| (m >> 2)) + ((uint64_t)(uint32_t)e << 52);`

			`/*`
			`* Rounding: if the low three bits of m are 011, 110 or 111,`
			`* then the value should be incremented to get the next`
			`* representable value. This implements the usual`
			`* round-to-nearest rule (with preference to even values in case`
			`* of a tie). Note that the increment may make a carry spill`
			`* into the exponent field, which is again exactly what we want`
			`* in that case.`
			`*/`
			`f = (unsigned)m & 7U;`
			`x += (0xC8U >> f) & 1;`
			`return x;`
			`}`

			`#define fpr_scaled PQCLEAN_FALCON512_CLEAN_fpr_scaled`
			`fpr fpr_scaled(int64_t i, int sc);`

			`static inline fpr`
			`fpr_of(int64_t i) {`
			`return fpr_scaled(i, 0);`
			`}`

			`static const fpr fpr_q = 4667981563525332992;`
			`static const fpr fpr_inverse_of_q = 4545632735260551042;`
			`static const fpr fpr_inv_2sqrsigma0 = 4594603506513722306;`
			`static const fpr fpr_inv_sigma = 4573359825155195350;`
			`static const fpr fpr_sigma_min_9 = 4608495221497168882;`
			`static const fpr fpr_sigma_min_10 = 4608586345619182117;`
			`static const fpr fpr_log2 = 4604418534313441775;`
			`static const fpr fpr_inv_log2 = 4609176140021203710;`
			`static const fpr fpr_bnorm_max = 4670353323383631276;`
			`static const fpr fpr_zero = 0;`
			`static const fpr fpr_one = 4607182418800017408;`
			`static const fpr fpr_two = 4611686018427387904;`
			`static const fpr fpr_onehalf = 4602678819172646912;`
			`static const fpr fpr_ptwo31 = 4746794007248502784;`
			`static const fpr fpr_ptwo31m1 = 4746794007244308480;`
			`static const fpr fpr_mtwo31m1 = 13970166044099084288U;`
			`static const fpr fpr_ptwo63m1 = 4890909195324358656;`
			`static const fpr fpr_mtwo63m1 = 14114281232179134464U;`
			`static const fpr fpr_ptwo63 = 4890909195324358656;`

			`static inline int64_t`
			`fpr_rint(fpr x) {`
			`uint64_t m, d;`
			`int e;`
Some cosmetic changes to appease clang-tidy. 2019-07-21 01:35:30 +01:00			`uint32_t s, dd, f;`
Falcon implementations (integer-only code, constant-time). 2019-07-21 00:44:25 +01:00
			`/*`
			`* We assume that the value fits in -(2^63-1)..+(2^63-1). We can`
			`* thus extract the mantissa as a 63-bit integer, then right-shift`
			`* it as needed.`
			`*/`
			`m = ((x << 10) \| ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);`
			`e = 1085 - ((int)(x >> 52) & 0x7FF);`

			`/*`
			`* If a shift of more than 63 bits is needed, then simply set m`
			`* to zero. This also covers the case of an input operand equal`
			`* to zero.`
			`*/`
			`m &= -(uint64_t)((uint32_t)(e - 64) >> 31);`
			`e &= 63;`

			`/*`
			`* Right-shift m as needed. Shift count is e. Proper rounding`
			`* mandates that:`
			`* - If the highest dropped bit is zero, then round low.`
			`* - If the highest dropped bit is one, and at least one of the`
			`* other dropped bits is one, then round up.`
			`* - If the highest dropped bit is one, and all other dropped`
			`* bits are zero, then round up if the lowest kept bit is 1,`
			`* or low otherwise (i.e. ties are broken by "rounding to even").`
			`*`
			`* We thus first extract a word consisting of all the dropped bit`
			`* AND the lowest kept bit; then we shrink it down to three bits,`
			`* the lowest being "sticky".`
			`*/`
			`d = fpr_ulsh(m, 63 - e);`
			`dd = (uint32_t)d \| ((uint32_t)(d >> 32) & 0x1FFFFFFF);`
Some cosmetic changes to appease clang-tidy. 2019-07-21 01:35:30 +01:00			`f = (uint32_t)(d >> 61) \| ((dd \| -dd) >> 31);`
Falcon implementations (integer-only code, constant-time). 2019-07-21 00:44:25 +01:00			`m = fpr_ursh(m, e) + (uint64_t)((0xC8U >> f) & 1U);`

			`/*`
			`* Apply the sign bit.`
			`*/`
			`s = (uint32_t)(x >> 63);`
			`return ((int64_t)m ^ -(int64_t)s) + (int64_t)s;`
			`}`

Removed 'restrict' keyword (it's C99, but MSVC 2017 does not support it...). This does not seem to impact performance. 2019-07-21 02:27:27 +01:00			`static inline int64_t`
Falcon implementations (integer-only code, constant-time). 2019-07-21 00:44:25 +01:00			`fpr_floor(fpr x) {`
			`uint64_t t;`
			`int64_t xi;`
			`int e, cc;`

			`/*`
			`* We extract the integer as a _signed_ 64-bit integer with`
			`* a scaling factor. Since we assume that the value fits`
			`* in the -(2^63-1)..+(2^63-1) range, we can left-shift the`
			`* absolute value to make it in the 2^62..2^63-1 range: we`
			`* will only need a right-shift afterwards.`
			`*/`
			`e = (int)(x >> 52) & 0x7FF;`
			`t = x >> 63;`
			`xi = (int64_t)(((x << 10) \| ((uint64_t)1 << 62))`
			`& (((uint64_t)1 << 63) - 1));`
			`xi = (xi ^ -(int64_t)t) + (int64_t)t;`
			`cc = 1085 - e;`

			`/*`
			`* We perform an arithmetic right-shift on the value. This`
			`* applies floor() semantics on both positive and negative values`
			`* (rounding toward minus infinity).`
			`*/`
			`xi = fpr_irsh(xi, cc & 63);`

			`/*`
			`* If the true shift count was 64 or more, then we should instead`
			`* replace xi with 0 (if nonnegative) or -1 (if negative). Edge`
			`* case: -0 will be floored to -1, not 0 (whether this is correct`
			`* is debatable; in any case, the other functions normalize zero`
			`* to +0).`
			`*`
			`* For an input of zero, the non-shifted xi was incorrect (we used`
			`* a top implicit bit of value 1, not 0), but this does not matter`
			`* since this operation will clamp it down.`
			`*/`
			`xi ^= (xi ^ -(int64_t)t) & -(int64_t)((uint32_t)(63 - cc) >> 31);`
			`return xi;`
			`}`

			`static inline int64_t`
			`fpr_trunc(fpr x) {`
			`uint64_t t, xu;`
			`int e, cc;`

			`/*`
			`* Extract the absolute value. Since we assume that the value`
			`* fits in the -(2^63-1)..+(2^63-1) range, we can left-shift`
			`* the absolute value into the 2^62..2^63-1 range, and then`
			`* do a right shift afterwards.`
			`*/`
			`e = (int)(x >> 52) & 0x7FF;`
			`xu = ((x << 10) \| ((uint64_t)1 << 62)) & (((uint64_t)1 << 63) - 1);`
			`cc = 1085 - e;`
			`xu = fpr_ursh(xu, cc & 63);`

			`/*`
			`* If the exponent is too low (cc > 63), then the shift was wrong`
			`* and we must clamp the value to 0. This also covers the case`
			`* of an input equal to zero.`
			`*/`
			`xu &= -(uint64_t)((uint32_t)(cc - 64) >> 31);`

			`/*`
			`* Apply back the sign, if the source value is negative.`
			`*/`
			`t = x >> 63;`
			`xu = (xu ^ -t) + t;`
			`return *(int64_t *)&xu;`
			`}`

			`#define fpr_add PQCLEAN_FALCON512_CLEAN_fpr_add`
			`fpr fpr_add(fpr x, fpr y);`

			`static inline fpr`
			`fpr_sub(fpr x, fpr y) {`
			`y ^= (uint64_t)1 << 63;`
			`return fpr_add(x, y);`
			`}`

			`static inline fpr`
			`fpr_neg(fpr x) {`
			`x ^= (uint64_t)1 << 63;`
			`return x;`
			`}`

			`static inline fpr`
			`fpr_half(fpr x) {`
			`/*`
			`* To divide a value by 2, we just have to subtract 1 from its`
			`* exponent, but we have to take care of zero.`
			`*/`
			`uint32_t t;`

			`x -= (uint64_t)1 << 52;`
			`t = (((uint32_t)(x >> 52) & 0x7FF) + 1) >> 11;`
			`x &= (uint64_t)t - 1;`
			`return x;`
			`}`

			`static inline fpr`
			`fpr_double(fpr x) {`
			`/*`
			`* To double a value, we just increment by one the exponent. We`
			`* don't care about infinites or NaNs; however, 0 is a`
			`* special case.`
			`*/`
			`x += (uint64_t)((((unsigned)(x >> 52) & 0x7FFU) + 0x7FFU) >> 11) << 52;`
			`return x;`
			`}`

			`#define fpr_mul PQCLEAN_FALCON512_CLEAN_fpr_mul`
			`fpr fpr_mul(fpr x, fpr y);`

			`static inline fpr`
			`fpr_sqr(fpr x) {`
			`return fpr_mul(x, x);`
			`}`

			`#define fpr_div PQCLEAN_FALCON512_CLEAN_fpr_div`
			`fpr fpr_div(fpr x, fpr y);`

			`static inline fpr`
			`fpr_inv(fpr x) {`
			`return fpr_div(4607182418800017408u, x);`
			`}`

			`#define fpr_sqrt PQCLEAN_FALCON512_CLEAN_fpr_sqrt`
			`fpr fpr_sqrt(fpr x);`

			`static inline int`
			`fpr_lt(fpr x, fpr y) {`
			`/*`
			`* If x >= 0 or y >= 0, a signed comparison yields the proper`
			`* result:`
			`* - For positive values, the order is preserved.`
			`* - The sign bit is at the same place as in integers, so`
			`* sign is preserved.`
			`*`
			`* If both x and y are negative, then the order is reversed.`
			`* We cannot simply invert the comparison result in that case`
			`* because it would not handle the edge case x = y properly.`
			`*/`
			`int cc0, cc1;`

			`cc0 = *(int64_t *)&x < *(int64_t *)&y;`
			`cc1 = *(int64_t *)&x > *(int64_t *)&y;`
			`return cc0 ^ ((cc0 ^ cc1) & (int)((x & y) >> 63));`
			`}`

			`/*`
			`* Compute exp(x) for x such that \|x\| <= ln 2. We want a precision of 50`
			`* bits or so.`
			`*/`
			`#define fpr_expm_p63 PQCLEAN_FALCON512_CLEAN_fpr_expm_p63`
			`uint64_t fpr_expm_p63(fpr x);`

			`#define fpr_gm_tab PQCLEAN_FALCON512_CLEAN_fpr_gm_tab`
			`extern const fpr fpr_gm_tab[];`

			`#define fpr_p2_tab PQCLEAN_FALCON512_CLEAN_fpr_p2_tab`
			`extern const fpr fpr_p2_tab[];`

			`/* ====================================================================== */`