pqc/crypto_sign/falcon-512/avx2/fpr.h

#ifndef PQCLEAN_FALCON512_AVX2_FPR_H
#define PQCLEAN_FALCON512_AVX2_FPR_H

/*
 * Floating-point operations.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2017-2019  Falcon Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@nccgroup.com>
 */


/* ====================================================================== */

#include <immintrin.h>
#include <math.h>

#define FMADD(a, b, c)   _mm256_add_pd(_mm256_mul_pd(a, b), c)
#define FMSUB(a, b, c)   _mm256_sub_pd(_mm256_mul_pd(a, b), c)

/*
 * We wrap the native 'double' type into a structure so that the C compiler
 * complains if we inadvertently use raw arithmetic operators on the 'fpr'
 * type instead of using the inline functions below. This should have no
 * extra runtime cost, since all the functions below are 'inline'.
 */
typedef struct {
    double v;
} fpr;

static inline fpr
FPR(double v) {
    fpr x;

    x.v = v;
    return x;
}

static inline fpr
fpr_of(int64_t i) {
    return FPR((double)i);
}

static const fpr fpr_q = { 12289.0 };
static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };
static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };
static const fpr fpr_inv_sigma = { .005819826392951607426919370871 };
static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 };
static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 };
static const fpr fpr_log2 = { 0.69314718055994530941723212146 };
static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };
static const fpr fpr_bnorm_max = { 16822.4121 };
static const fpr fpr_zero = { 0.0 };
static const fpr fpr_one = { 1.0 };
static const fpr fpr_two = { 2.0 };
static const fpr fpr_onehalf = { 0.5 };
static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };
static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };
static const fpr fpr_ptwo31 = { 2147483648.0 };
static const fpr fpr_ptwo31m1 = { 2147483647.0 };
static const fpr fpr_mtwo31m1 = { -2147483647.0 };
static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };
static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };
static const fpr fpr_ptwo63 = { 9223372036854775808.0 };

static inline int64_t
fpr_rint(fpr x) {
    /*
     * We do not want to use llrint() since it might be not
     * constant-time.
     *
     * Suppose that x >= 0. If x >= 2^52, then it is already an
     * integer. Otherwise, if x < 2^52, then computing x+2^52 will
     * yield a value that will be rounded to the nearest integer
     * with exactly the right rules (round-to-nearest-even).
     *
     * In order to have constant-time processing, we must do the
     * computation for both x >= 0 and x < 0 cases, and use a
     * cast to an integer to access the sign and select the proper
     * value. Such casts also allow us to find out if |x| < 2^52.
     */
    int64_t sx, tx, rp, rn, m;
    uint32_t ub;

    sx = (int64_t)(x.v - 1.0);
    tx = (int64_t)x.v;
    rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;
    rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;

    /*
     * If tx >= 2^52 or tx < -2^52, then result is tx.
     * Otherwise, if sx >= 0, then result is rp.
     * Otherwise, result is rn. We use the fact that when x is
     * close to 0 (|x| <= 0.25) then both rp and rn are correct;
     * and if x is not close to 0, then trunc(x-1.0) yields the
     * appropriate sign.
     */

    /*
     * Clamp rp to zero if tx < 0.
     * Clamp rn to zero if tx >= 0.
     */
    m = sx >> 63;
    rn &= m;
    rp &= ~m;

    /*
     * Get the 12 upper bits of tx; if they are not all zeros or
     * all ones, then tx >= 2^52 or tx < -2^52, and we clamp both
     * rp and rn to zero. Otherwise, we clamp tx to zero.
     */
    ub = (uint32_t)((uint64_t)tx >> 52);
    m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);
    rp &= m;
    rn &= m;
    tx &= ~m;

    /*
     * Only one of tx, rn or rp (at most) can be non-zero at this
     * point.
     */
    return tx | rn | rp;
}

static inline int64_t
fpr_floor(fpr x) {
    int64_t r;

    /*
     * The cast performs a trunc() (rounding toward 0) and thus is
     * wrong by 1 for most negative values. The correction below is
     * constant-time as long as the compiler turns the
     * floating-point conversion result into a 0/1 integer without a
     * conditional branch or another non-constant-time construction.
     * This should hold on all modern architectures with an FPU (and
     * if it is false on a given arch, then chances are that the FPU
     * itself is not constant-time, making the point moot).
     */
    r = (int64_t)x.v;
    return r - (x.v < (double)r);
}

static inline int64_t
fpr_trunc(fpr x) {
    return (int64_t)x.v;
}

static inline fpr
fpr_add(fpr x, fpr y) {
    return FPR(x.v + y.v);
}

static inline fpr
fpr_sub(fpr x, fpr y) {
    return FPR(x.v - y.v);
}

static inline fpr
fpr_neg(fpr x) {
    return FPR(-x.v);
}

static inline fpr
fpr_half(fpr x) {
    return FPR(x.v * 0.5);
}

static inline fpr
fpr_double(fpr x) {
    return FPR(x.v + x.v);
}

static inline fpr
fpr_mul(fpr x, fpr y) {
    return FPR(x.v * y.v);
}

static inline fpr
fpr_sqr(fpr x) {
    return FPR(x.v * x.v);
}

static inline fpr
fpr_inv(fpr x) {
    return FPR(1.0 / x.v);
}

static inline fpr
fpr_div(fpr x, fpr y) {
    return FPR(x.v / y.v);
}

static inline void
fpr_sqrt_avx2(double *t) {
    __m128d x;

    x = _mm_load1_pd(t);
    x = _mm_sqrt_pd(x);
    _mm_storel_pd(t, x);
}

static inline fpr
fpr_sqrt(fpr x) {
    /*
     * We prefer not to have a dependency on libm when it can be
     * avoided. On x86, calling the sqrt() libm function inlines
     * the relevant opcode (fsqrt or sqrtsd, depending on whether
     * the 387 FPU or SSE2 is used for floating-point operations)
     * but then makes an optional call to the library function
     * for proper error handling, in case the operand is negative.
     *
     * To avoid this dependency, we use intrinsics or inline assembly
     * on recognized platforms:
     *
     *  - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.
     *
     *  - On GCC/Clang with SSE maths, we use SSE2 intrinsics.
     *
     *  - On GCC/Clang on i386, or MSVC on i386, we use inline assembly
     *    to call the 387 FPU fsqrt opcode.
     *
     *  - On GCC/Clang/XLC on PowerPC, we use inline assembly to call
     *    the fsqrt opcode (Clang needs a special hack).
     *
     *  - On GCC/Clang on ARM with hardware floating-point, we use
     *    inline assembly to call the vqsrt.f64 opcode. Due to a
     *    complex ecosystem of compilers and assembly syntaxes, we
     *    have to call it "fsqrt" or "fsqrtd", depending on case.
     *
     * If the platform is not recognized, a call to the system
     * library function sqrt() is performed. On some compilers, this
     * may actually inline the relevant opcode, and call the library
     * function only when the input is invalid (e.g. negative);
     * Falcon never actually calls sqrt() on a negative value, but
     * the dependency to libm will still be there.
     */

    fpr_sqrt_avx2(&x.v);
    return x;
}

static inline int
fpr_lt(fpr x, fpr y) {
    return x.v < y.v;
}

static inline uint64_t
fpr_expm_p63(fpr x, fpr ccs) {
    /*
     * Polynomial approximation of exp(-x) is taken from FACCT:
     *   https://eprint.iacr.org/2018/1234
     * Specifically, values are extracted from the implementation
     * referenced from the FACCT article, and available at:
     *   https://github.com/raykzhao/gaussian
     * Tests over more than 24 billions of random inputs in the
     * 0..log(2) range have never shown a deviation larger than
     * 2^(-50) from the true mathematical value.
     */


    /*
     * AVX2 implementation uses more operations than Horner's method,
     * but with a lower expression tree depth. This helps because
     * additions and multiplications have a latency of 4 cycles on
     * a Skylake, but the CPU can issue two of them per cycle.
     */

    static const union {
        double d[12];
        __m256d v[3];
    } c = {
        {
            0.999999999999994892974086724280,
            0.500000000000019206858326015208,
            0.166666666666984014666397229121,
            0.041666666666110491190622155955,
            0.008333333327800835146903501993,
            0.001388888894063186997887560103,
            0.000198412739277311890541063977,
            0.000024801566833585381209939524,
            0.000002755586350219122514855659,
            0.000000275607356160477811864927,
            0.000000025299506379442070029551,
            0.000000002073772366009083061987
        }
    };

    double d1, d2, d4, d8, y;
    __m256d d14, d58, d9c;

    d1 = -x.v;
    d2 = d1 * d1;
    d4 = d2 * d2;
    d8 = d4 * d4;
    d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);
    d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));
    d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));
    d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));
    d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);
    d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);
    d9c = _mm256_hadd_pd(d9c, d9c);
    y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)
        + _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));
    y *= ccs.v;

    /*
     * Final conversion goes through int64_t first, because that's what
     * the underlying opcode (vcvttsd2si) will do, and we know that the
     * result will fit, since x >= 0 and ccs < 1. If we did the
     * conversion directly to uint64_t, then the compiler would add some
     * extra code to cover the case of a source value of 2^63 or more,
     * and though the alternate path would never be exercised, the
     * extra comparison would cost us some cycles.
     */
    return (uint64_t)(int64_t)(y * fpr_ptwo63.v);

}

#define fpr_gm_tab   PQCLEAN_FALCON512_AVX2_fpr_gm_tab
extern const fpr fpr_gm_tab[];

#define fpr_p2_tab   PQCLEAN_FALCON512_AVX2_fpr_p2_tab
extern const fpr fpr_p2_tab[];

/* ====================================================================== */
#endif
Add AVX2 Falcon 2020-10-21 21:37:33 +01:00			`#ifndef PQCLEAN_FALCON512_AVX2_FPR_H`
			`#define PQCLEAN_FALCON512_AVX2_FPR_H`

			`/*`
			`* Floating-point operations.`
			`*`
			`* ==========================(LICENSE BEGIN)============================`
			`*`
			`* Copyright (c) 2017-2019 Falcon Project`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining`
			`* a copy of this software and associated documentation files (the`
			`* "Software"), to deal in the Software without restriction, including`
			`* without limitation the rights to use, copy, modify, merge, publish,`
			`* distribute, sublicense, and/or sell copies of the Software, and to`
			`* permit persons to whom the Software is furnished to do so, subject to`
			`* the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be`
			`* included in all copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,`
			`* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY`
			`* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,`
			`* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE`
			`* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`
			`*`
			`* ===========================(LICENSE END)=============================`
			`*`
			`* @author Thomas Pornin <thomas.pornin@nccgroup.com>`
			`*/`


			`/* ====================================================================== */`

			`#include <immintrin.h>`
			`#include <math.h>`

			`#define FMADD(a, b, c) _mm256_add_pd(_mm256_mul_pd(a, b), c)`
			`#define FMSUB(a, b, c) _mm256_sub_pd(_mm256_mul_pd(a, b), c)`

			`/*`
			`* We wrap the native 'double' type into a structure so that the C compiler`
			`* complains if we inadvertently use raw arithmetic operators on the 'fpr'`
			`* type instead of using the inline functions below. This should have no`
			`* extra runtime cost, since all the functions below are 'inline'.`
			`*/`
			`typedef struct {`
			`double v;`
			`} fpr;`

			`static inline fpr`
			`FPR(double v) {`
			`fpr x;`

			`x.v = v;`
			`return x;`
			`}`

			`static inline fpr`
			`fpr_of(int64_t i) {`
			`return FPR((double)i);`
			`}`

			`static const fpr fpr_q = { 12289.0 };`
			`static const fpr fpr_inverse_of_q = { 1.0 / 12289.0 };`
			`static const fpr fpr_inv_2sqrsigma0 = { .150865048875372721532312163019 };`
			`static const fpr fpr_inv_sigma = { .005819826392951607426919370871 };`
			`static const fpr fpr_sigma_min_9 = { 1.291500756233514568549480827642 };`
			`static const fpr fpr_sigma_min_10 = { 1.311734375905083682667395805765 };`
			`static const fpr fpr_log2 = { 0.69314718055994530941723212146 };`
			`static const fpr fpr_inv_log2 = { 1.4426950408889634073599246810 };`
			`static const fpr fpr_bnorm_max = { 16822.4121 };`
			`static const fpr fpr_zero = { 0.0 };`
			`static const fpr fpr_one = { 1.0 };`
			`static const fpr fpr_two = { 2.0 };`
			`static const fpr fpr_onehalf = { 0.5 };`
			`static const fpr fpr_invsqrt2 = { 0.707106781186547524400844362105 };`
			`static const fpr fpr_invsqrt8 = { 0.353553390593273762200422181052 };`
			`static const fpr fpr_ptwo31 = { 2147483648.0 };`
			`static const fpr fpr_ptwo31m1 = { 2147483647.0 };`
			`static const fpr fpr_mtwo31m1 = { -2147483647.0 };`
			`static const fpr fpr_ptwo63m1 = { 9223372036854775807.0 };`
			`static const fpr fpr_mtwo63m1 = { -9223372036854775807.0 };`
			`static const fpr fpr_ptwo63 = { 9223372036854775808.0 };`

			`static inline int64_t`
			`fpr_rint(fpr x) {`
			`/*`
			`* We do not want to use llrint() since it might be not`
			`* constant-time.`
			`*`
			`* Suppose that x >= 0. If x >= 2^52, then it is already an`
			`* integer. Otherwise, if x < 2^52, then computing x+2^52 will`
			`* yield a value that will be rounded to the nearest integer`
			`* with exactly the right rules (round-to-nearest-even).`
			`*`
			`* In order to have constant-time processing, we must do the`
			`* computation for both x >= 0 and x < 0 cases, and use a`
			`* cast to an integer to access the sign and select the proper`
			`* value. Such casts also allow us to find out if \|x\| < 2^52.`
			`*/`
			`int64_t sx, tx, rp, rn, m;`
			`uint32_t ub;`

			`sx = (int64_t)(x.v - 1.0);`
			`tx = (int64_t)x.v;`
			`rp = (int64_t)(x.v + 4503599627370496.0) - 4503599627370496;`
			`rn = (int64_t)(x.v - 4503599627370496.0) + 4503599627370496;`

			`/*`
			`* If tx >= 2^52 or tx < -2^52, then result is tx.`
			`* Otherwise, if sx >= 0, then result is rp.`
			`* Otherwise, result is rn. We use the fact that when x is`
			`* close to 0 (\|x\| <= 0.25) then both rp and rn are correct;`
			`* and if x is not close to 0, then trunc(x-1.0) yields the`
			`* appropriate sign.`
			`*/`

			`/*`
			`* Clamp rp to zero if tx < 0.`
			`* Clamp rn to zero if tx >= 0.`
			`*/`
			`m = sx >> 63;`
			`rn &= m;`
			`rp &= ~m;`

			`/*`
			`* Get the 12 upper bits of tx; if they are not all zeros or`
			`* all ones, then tx >= 2^52 or tx < -2^52, and we clamp both`
			`* rp and rn to zero. Otherwise, we clamp tx to zero.`
			`*/`
			`ub = (uint32_t)((uint64_t)tx >> 52);`
			`m = -(int64_t)((((ub + 1) & 0xFFF) - 2) >> 31);`
			`rp &= m;`
			`rn &= m;`
			`tx &= ~m;`

			`/*`
			`* Only one of tx, rn or rp (at most) can be non-zero at this`
			`* point.`
			`*/`
			`return tx \| rn \| rp;`
			`}`

			`static inline int64_t`
			`fpr_floor(fpr x) {`
			`int64_t r;`

			`/*`
			`* The cast performs a trunc() (rounding toward 0) and thus is`
			`* wrong by 1 for most negative values. The correction below is`
			`* constant-time as long as the compiler turns the`
			`* floating-point conversion result into a 0/1 integer without a`
			`* conditional branch or another non-constant-time construction.`
			`* This should hold on all modern architectures with an FPU (and`
			`* if it is false on a given arch, then chances are that the FPU`
			`* itself is not constant-time, making the point moot).`
			`*/`
			`r = (int64_t)x.v;`
			`return r - (x.v < (double)r);`
			`}`

			`static inline int64_t`
			`fpr_trunc(fpr x) {`
			`return (int64_t)x.v;`
			`}`

			`static inline fpr`
			`fpr_add(fpr x, fpr y) {`
			`return FPR(x.v + y.v);`
			`}`

			`static inline fpr`
			`fpr_sub(fpr x, fpr y) {`
			`return FPR(x.v - y.v);`
			`}`

			`static inline fpr`
			`fpr_neg(fpr x) {`
			`return FPR(-x.v);`
			`}`

			`static inline fpr`
			`fpr_half(fpr x) {`
			`return FPR(x.v * 0.5);`
			`}`

			`static inline fpr`
			`fpr_double(fpr x) {`
			`return FPR(x.v + x.v);`
			`}`

			`static inline fpr`
			`fpr_mul(fpr x, fpr y) {`
			`return FPR(x.v * y.v);`
			`}`

			`static inline fpr`
			`fpr_sqr(fpr x) {`
			`return FPR(x.v * x.v);`
			`}`

			`static inline fpr`
			`fpr_inv(fpr x) {`
			`return FPR(1.0 / x.v);`
			`}`

			`static inline fpr`
			`fpr_div(fpr x, fpr y) {`
			`return FPR(x.v / y.v);`
			`}`

			`static inline void`
			`fpr_sqrt_avx2(double *t) {`
			`__m128d x;`

			`x = _mm_load1_pd(t);`
			`x = _mm_sqrt_pd(x);`
			`_mm_storel_pd(t, x);`
			`}`

			`static inline fpr`
			`fpr_sqrt(fpr x) {`
			`/*`
			`* We prefer not to have a dependency on libm when it can be`
			`* avoided. On x86, calling the sqrt() libm function inlines`
			`* the relevant opcode (fsqrt or sqrtsd, depending on whether`
			`* the 387 FPU or SSE2 is used for floating-point operations)`
			`* but then makes an optional call to the library function`
			`* for proper error handling, in case the operand is negative.`
			`*`
			`* To avoid this dependency, we use intrinsics or inline assembly`
			`* on recognized platforms:`
			`*`
			`* - If AVX2 is explicitly enabled, then we use SSE2 intrinsics.`
			`*`
			`* - On GCC/Clang with SSE maths, we use SSE2 intrinsics.`
			`*`
			`* - On GCC/Clang on i386, or MSVC on i386, we use inline assembly`
			`* to call the 387 FPU fsqrt opcode.`
			`*`
			`* - On GCC/Clang/XLC on PowerPC, we use inline assembly to call`
			`* the fsqrt opcode (Clang needs a special hack).`
			`*`
			`* - On GCC/Clang on ARM with hardware floating-point, we use`
			`* inline assembly to call the vqsrt.f64 opcode. Due to a`
			`* complex ecosystem of compilers and assembly syntaxes, we`
			`* have to call it "fsqrt" or "fsqrtd", depending on case.`
			`*`
			`* If the platform is not recognized, a call to the system`
			`* library function sqrt() is performed. On some compilers, this`
			`* may actually inline the relevant opcode, and call the library`
			`* function only when the input is invalid (e.g. negative);`
			`* Falcon never actually calls sqrt() on a negative value, but`
			`* the dependency to libm will still be there.`
			`*/`

			`fpr_sqrt_avx2(&x.v);`
			`return x;`
			`}`

			`static inline int`
			`fpr_lt(fpr x, fpr y) {`
			`return x.v < y.v;`
			`}`

			`static inline uint64_t`
			`fpr_expm_p63(fpr x, fpr ccs) {`
			`/*`
			`* Polynomial approximation of exp(-x) is taken from FACCT:`
			`* https://eprint.iacr.org/2018/1234`
			`* Specifically, values are extracted from the implementation`
			`* referenced from the FACCT article, and available at:`
			`* https://github.com/raykzhao/gaussian`
			`* Tests over more than 24 billions of random inputs in the`
			`* 0..log(2) range have never shown a deviation larger than`
			`* 2^(-50) from the true mathematical value.`
			`*/`


			`/*`
			`* AVX2 implementation uses more operations than Horner's method,`
			`* but with a lower expression tree depth. This helps because`
			`* additions and multiplications have a latency of 4 cycles on`
			`* a Skylake, but the CPU can issue two of them per cycle.`
			`*/`

			`static const union {`
			`double d[12];`
			`__m256d v[3];`
			`} c = {`
			`{`
			`0.999999999999994892974086724280,`
			`0.500000000000019206858326015208,`
			`0.166666666666984014666397229121,`
			`0.041666666666110491190622155955,`
			`0.008333333327800835146903501993,`
			`0.001388888894063186997887560103,`
			`0.000198412739277311890541063977,`
			`0.000024801566833585381209939524,`
			`0.000002755586350219122514855659,`
			`0.000000275607356160477811864927,`
			`0.000000025299506379442070029551,`
			`0.000000002073772366009083061987`
			`}`
			`};`

			`double d1, d2, d4, d8, y;`
			`__m256d d14, d58, d9c;`

			`d1 = -x.v;`
			`d2 = d1 * d1;`
			`d4 = d2 * d2;`
			`d8 = d4 * d4;`
			`d14 = _mm256_set_pd(d4, d2 * d1, d2, d1);`
			`d58 = _mm256_mul_pd(d14, _mm256_set1_pd(d4));`
			`d9c = _mm256_mul_pd(d14, _mm256_set1_pd(d8));`
			`d14 = _mm256_mul_pd(d14, _mm256_loadu_pd(&c.d[0]));`
			`d58 = FMADD(d58, _mm256_loadu_pd(&c.d[4]), d14);`
			`d9c = FMADD(d9c, _mm256_loadu_pd(&c.d[8]), d58);`
			`d9c = _mm256_hadd_pd(d9c, d9c);`
			`y = 1.0 + _mm_cvtsd_f64(_mm256_castpd256_pd128(d9c)) // _mm256_cvtsd_f64(d9c)`
			`+ _mm_cvtsd_f64(_mm256_extractf128_pd(d9c, 1));`
			`y *= ccs.v;`

			`/*`
			`* Final conversion goes through int64_t first, because that's what`
			`* the underlying opcode (vcvttsd2si) will do, and we know that the`
			`* result will fit, since x >= 0 and ccs < 1. If we did the`
			`* conversion directly to uint64_t, then the compiler would add some`
			`* extra code to cover the case of a source value of 2^63 or more,`
			`* and though the alternate path would never be exercised, the`
			`* extra comparison would cost us some cycles.`
			`*/`
			`return (uint64_t)(int64_t)(y * fpr_ptwo63.v);`

			`}`

			`#define fpr_gm_tab PQCLEAN_FALCON512_AVX2_fpr_gm_tab`
			`extern const fpr fpr_gm_tab[];`

			`#define fpr_p2_tab PQCLEAN_FALCON512_AVX2_fpr_p2_tab`
			`extern const fpr fpr_p2_tab[];`

			`/* ====================================================================== */`
			`#endif`