1313 lines
41 KiB
C
1313 lines
41 KiB
C
#include "inner.h"
|
|
|
|
/*
|
|
* Falcon signature generation.
|
|
*
|
|
* ==========================(LICENSE BEGIN)============================
|
|
*
|
|
* Copyright (c) 2017-2019 Falcon Project
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining
|
|
* a copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sublicense, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
* ===========================(LICENSE END)=============================
|
|
*
|
|
* @author Thomas Pornin <thomas.pornin@nccgroup.com>
|
|
*/
|
|
|
|
|
|
/* =================================================================== */
|
|
|
|
/*
|
|
* Compute degree N from logarithm 'logn'.
|
|
*/
|
|
#define MKN(logn) ((size_t)1 << (logn))
|
|
|
|
/* =================================================================== */
|
|
/*
|
|
* Binary case:
|
|
* N = 2^logn
|
|
* phi = X^N+1
|
|
*/
|
|
|
|
/*
|
|
* Get the size of the LDL tree for an input with polynomials of size
|
|
* 2^logn. The size is expressed in the number of elements.
|
|
*/
|
|
static inline unsigned
|
|
ffLDL_treesize(unsigned logn) {
|
|
/*
|
|
* For logn = 0 (polynomials are constant), the "tree" is a
|
|
* single element. Otherwise, the tree node has size 2^logn, and
|
|
* has two child trees for size logn-1 each. Thus, treesize s()
|
|
* must fulfill these two relations:
|
|
*
|
|
* s(0) = 1
|
|
* s(logn) = (2^logn) + 2*s(logn-1)
|
|
*/
|
|
return (logn + 1) << logn;
|
|
}
|
|
|
|
/*
|
|
* Inner function for ffLDL_fft(). It expects the matrix to be both
|
|
* auto-adjoint and quasicyclic; also, it uses the source operands
|
|
* as modifiable temporaries.
|
|
*
|
|
* tmp[] must have room for at least one polynomial.
|
|
*/
|
|
static void
|
|
ffLDL_fft_inner(fpr *tree,
|
|
fpr *g0, fpr *g1, unsigned logn, fpr *tmp) {
|
|
size_t n, hn;
|
|
|
|
n = MKN(logn);
|
|
if (n == 1) {
|
|
tree[0] = g0[0];
|
|
return;
|
|
}
|
|
hn = n >> 1;
|
|
|
|
/*
|
|
* The LDL decomposition yields L (which is written in the tree)
|
|
* and the diagonal of D. Since d00 = g0, we just write d11
|
|
* into tmp.
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_LDLmv_fft(tmp, tree, g0, g1, g0, logn);
|
|
|
|
/*
|
|
* Split d00 (currently in g0) and d11 (currently in tmp). We
|
|
* reuse g0 and g1 as temporary storage spaces:
|
|
* d00 splits into g1, g1+hn
|
|
* d11 splits into g0, g0+hn
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(g1, g1 + hn, g0, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(g0, g0 + hn, tmp, logn);
|
|
|
|
/*
|
|
* Each split result is the first row of a new auto-adjoint
|
|
* quasicyclic matrix for the next recursive step.
|
|
*/
|
|
ffLDL_fft_inner(tree + n,
|
|
g1, g1 + hn, logn - 1, tmp);
|
|
ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
|
|
g0, g0 + hn, logn - 1, tmp);
|
|
}
|
|
|
|
/*
|
|
* Compute the ffLDL tree of an auto-adjoint matrix G. The matrix
|
|
* is provided as three polynomials (FFT representation).
|
|
*
|
|
* The "tree" array is filled with the computed tree, of size
|
|
* (logn+1)*(2^logn) elements (see ffLDL_treesize()).
|
|
*
|
|
* Input arrays MUST NOT overlap, except possibly the three unmodified
|
|
* arrays g00, g01 and g11. tmp[] should have room for at least three
|
|
* polynomials of 2^logn elements each.
|
|
*/
|
|
static void
|
|
ffLDL_fft(fpr *tree, const fpr *g00,
|
|
const fpr *g01, const fpr *g11,
|
|
unsigned logn, fpr *tmp) {
|
|
size_t n, hn;
|
|
fpr *d00, *d11;
|
|
|
|
n = MKN(logn);
|
|
if (n == 1) {
|
|
tree[0] = g00[0];
|
|
return;
|
|
}
|
|
hn = n >> 1;
|
|
d00 = tmp;
|
|
d11 = tmp + n;
|
|
tmp += n << 1;
|
|
|
|
memcpy(d00, g00, n * sizeof * g00);
|
|
PQCLEAN_FALCON1024_AVX2_poly_LDLmv_fft(d11, tree, g00, g01, g11, logn);
|
|
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(tmp, tmp + hn, d00, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(d00, d00 + hn, d11, logn);
|
|
memcpy(d11, tmp, n * sizeof * tmp);
|
|
ffLDL_fft_inner(tree + n,
|
|
d11, d11 + hn, logn - 1, tmp);
|
|
ffLDL_fft_inner(tree + n + ffLDL_treesize(logn - 1),
|
|
d00, d00 + hn, logn - 1, tmp);
|
|
}
|
|
|
|
/*
|
|
* Normalize an ffLDL tree: each leaf of value x is replaced with
|
|
* sigma / sqrt(x).
|
|
*/
|
|
static void
|
|
ffLDL_binary_normalize(fpr *tree, unsigned logn) {
|
|
/*
|
|
* TODO: make an iterative version.
|
|
*/
|
|
size_t n;
|
|
|
|
n = MKN(logn);
|
|
if (n == 1) {
|
|
/*
|
|
* We actually store in the tree leaf the inverse of
|
|
* the value mandated by the specification: this
|
|
* saves a division both here and in the sampler.
|
|
*/
|
|
tree[0] = fpr_mul(fpr_sqrt(tree[0]), fpr_inv_sigma);
|
|
} else {
|
|
ffLDL_binary_normalize(tree + n, logn - 1);
|
|
ffLDL_binary_normalize(tree + n + ffLDL_treesize(logn - 1),
|
|
logn - 1);
|
|
}
|
|
}
|
|
|
|
/* =================================================================== */
|
|
|
|
/*
|
|
* Convert an integer polynomial (with small values) into the
|
|
* representation with complex numbers.
|
|
*/
|
|
static void
|
|
smallints_to_fpr(fpr *r, const int8_t *t, unsigned logn) {
|
|
size_t n, u;
|
|
|
|
n = MKN(logn);
|
|
for (u = 0; u < n; u ++) {
|
|
r[u] = fpr_of(t[u]);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* The expanded private key contains:
|
|
* - The B0 matrix (four elements)
|
|
* - The ffLDL tree
|
|
*/
|
|
|
|
static inline size_t
|
|
skoff_b00(unsigned logn) {
|
|
(void)logn;
|
|
return 0;
|
|
}
|
|
|
|
static inline size_t
|
|
skoff_b01(unsigned logn) {
|
|
return MKN(logn);
|
|
}
|
|
|
|
static inline size_t
|
|
skoff_b10(unsigned logn) {
|
|
return 2 * MKN(logn);
|
|
}
|
|
|
|
static inline size_t
|
|
skoff_b11(unsigned logn) {
|
|
return 3 * MKN(logn);
|
|
}
|
|
|
|
static inline size_t
|
|
skoff_tree(unsigned logn) {
|
|
return 4 * MKN(logn);
|
|
}
|
|
|
|
/* see inner.h */
|
|
void
|
|
PQCLEAN_FALCON1024_AVX2_expand_privkey(fpr *expanded_key,
|
|
const int8_t *f, const int8_t *g,
|
|
const int8_t *F, const int8_t *G,
|
|
unsigned logn, uint8_t *tmp) {
|
|
size_t n;
|
|
fpr *rf, *rg, *rF, *rG;
|
|
fpr *b00, *b01, *b10, *b11;
|
|
fpr *g00, *g01, *g11, *gxx;
|
|
fpr *tree;
|
|
|
|
n = MKN(logn);
|
|
b00 = expanded_key + skoff_b00(logn);
|
|
b01 = expanded_key + skoff_b01(logn);
|
|
b10 = expanded_key + skoff_b10(logn);
|
|
b11 = expanded_key + skoff_b11(logn);
|
|
tree = expanded_key + skoff_tree(logn);
|
|
|
|
/*
|
|
* We load the private key elements directly into the B0 matrix,
|
|
* since B0 = [[g, -f], [G, -F]].
|
|
*/
|
|
rf = b01;
|
|
rg = b00;
|
|
rF = b11;
|
|
rG = b10;
|
|
|
|
smallints_to_fpr(rf, f, logn);
|
|
smallints_to_fpr(rg, g, logn);
|
|
smallints_to_fpr(rF, F, logn);
|
|
smallints_to_fpr(rG, G, logn);
|
|
|
|
/*
|
|
* Compute the FFT for the key elements, and negate f and F.
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_FFT(rf, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(rg, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(rF, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(rG, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(rf, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(rF, logn);
|
|
|
|
/*
|
|
* The Gram matrix is G = B·B*. Formulas are:
|
|
* g00 = b00*adj(b00) + b01*adj(b01)
|
|
* g01 = b00*adj(b10) + b01*adj(b11)
|
|
* g10 = b10*adj(b00) + b11*adj(b01)
|
|
* g11 = b10*adj(b10) + b11*adj(b11)
|
|
*
|
|
* For historical reasons, this implementation uses
|
|
* g00, g01 and g11 (upper triangle).
|
|
*/
|
|
g00 = (fpr *)tmp;
|
|
g01 = g00 + n;
|
|
g11 = g01 + n;
|
|
gxx = g11 + n;
|
|
|
|
memcpy(g00, b00, n * sizeof * b00);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(g00, logn);
|
|
memcpy(gxx, b01, n * sizeof * b01);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(gxx, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(g00, gxx, logn);
|
|
|
|
memcpy(g01, b00, n * sizeof * b00);
|
|
PQCLEAN_FALCON1024_AVX2_poly_muladj_fft(g01, b10, logn);
|
|
memcpy(gxx, b01, n * sizeof * b01);
|
|
PQCLEAN_FALCON1024_AVX2_poly_muladj_fft(gxx, b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(g01, gxx, logn);
|
|
|
|
memcpy(g11, b10, n * sizeof * b10);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(g11, logn);
|
|
memcpy(gxx, b11, n * sizeof * b11);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(gxx, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(g11, gxx, logn);
|
|
|
|
/*
|
|
* Compute the Falcon tree.
|
|
*/
|
|
ffLDL_fft(tree, g00, g01, g11, logn, gxx);
|
|
|
|
/*
|
|
* Normalize tree.
|
|
*/
|
|
ffLDL_binary_normalize(tree, logn);
|
|
}
|
|
|
|
typedef int (*samplerZ)(void *ctx, fpr mu, fpr sigma);
|
|
|
|
/*
|
|
* Perform Fast Fourier Sampling for target vector t. The Gram matrix
|
|
* is provided (G = [[g00, g01], [adj(g01), g11]]). The sampled vector
|
|
* is written over (t0,t1). The Gram matrix is modified as well. The
|
|
* tmp[] buffer must have room for four polynomials.
|
|
*/
|
|
static void
|
|
ffSampling_fft_dyntree(samplerZ samp, void *samp_ctx,
|
|
fpr *t0, fpr *t1,
|
|
fpr *g00, fpr *g01, fpr *g11,
|
|
unsigned logn, fpr *tmp) {
|
|
size_t n, hn;
|
|
fpr *z0, *z1;
|
|
|
|
/*
|
|
* Deepest level: the LDL tree leaf value is just g00 (the
|
|
* array has length only 1 at this point); we normalize it
|
|
* with regards to sigma, then use it for sampling.
|
|
*/
|
|
if (logn == 0) {
|
|
fpr leaf;
|
|
|
|
leaf = g00[0];
|
|
leaf = fpr_mul(fpr_sqrt(leaf), fpr_inv_sigma);
|
|
t0[0] = fpr_of(samp(samp_ctx, t0[0], leaf));
|
|
t1[0] = fpr_of(samp(samp_ctx, t1[0], leaf));
|
|
return;
|
|
}
|
|
|
|
n = (size_t)1 << logn;
|
|
hn = n >> 1;
|
|
|
|
/*
|
|
* Decompose G into LDL. We only need d00 (identical to g00),
|
|
* d11, and l10; we do that in place.
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_LDL_fft(g00, g01, g11, logn);
|
|
|
|
/*
|
|
* Split d00 and d11 and expand them into half-size quasi-cyclic
|
|
* Gram matrices. We also save l10 in tmp[].
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(tmp, tmp + hn, g00, logn);
|
|
memcpy(g00, tmp, n * sizeof * tmp);
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(tmp, tmp + hn, g11, logn);
|
|
memcpy(g11, tmp, n * sizeof * tmp);
|
|
memcpy(tmp, g01, n * sizeof * g01);
|
|
memcpy(g01, g00, hn * sizeof * g00);
|
|
memcpy(g01 + hn, g11, hn * sizeof * g00);
|
|
|
|
/*
|
|
* The half-size Gram matrices for the recursive LDL tree
|
|
* building are now:
|
|
* - left sub-tree: g00, g00+hn, g01
|
|
* - right sub-tree: g11, g11+hn, g01+hn
|
|
* l10 is in tmp[].
|
|
*/
|
|
|
|
/*
|
|
* We split t1 and use the first recursive call on the two
|
|
* halves, using the right sub-tree. The result is merged
|
|
* back into tmp + 2*n.
|
|
*/
|
|
z1 = tmp + n;
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
|
|
ffSampling_fft_dyntree(samp, samp_ctx, z1, z1 + hn,
|
|
g11, g11 + hn, g01 + hn, logn - 1, z1 + n);
|
|
PQCLEAN_FALCON1024_AVX2_poly_merge_fft(tmp + (n << 1), z1, z1 + hn, logn);
|
|
|
|
/*
|
|
* Compute tb0 = t0 + (t1 - z1) * l10.
|
|
* At that point, l10 is in tmp, t1 is unmodified, and z1 is
|
|
* in tmp + (n << 1). The buffer in z1 is free.
|
|
*
|
|
* In the end, z1 is written over t1, and tb0 is in t0.
|
|
*/
|
|
memcpy(z1, t1, n * sizeof * t1);
|
|
PQCLEAN_FALCON1024_AVX2_poly_sub(z1, tmp + (n << 1), logn);
|
|
memcpy(t1, tmp + (n << 1), n * sizeof * tmp);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(tmp, z1, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(t0, tmp, logn);
|
|
|
|
/*
|
|
* Second recursive invocation, on the split tb0 (currently in t0)
|
|
* and the left sub-tree.
|
|
*/
|
|
z0 = tmp;
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(z0, z0 + hn, t0, logn);
|
|
ffSampling_fft_dyntree(samp, samp_ctx, z0, z0 + hn,
|
|
g00, g00 + hn, g01, logn - 1, z0 + n);
|
|
PQCLEAN_FALCON1024_AVX2_poly_merge_fft(t0, z0, z0 + hn, logn);
|
|
}
|
|
|
|
/*
|
|
* Perform Fast Fourier Sampling for target vector t and LDL tree T.
|
|
* tmp[] must have size for at least two polynomials of size 2^logn.
|
|
*/
|
|
static void
|
|
ffSampling_fft(samplerZ samp, void *samp_ctx,
|
|
fpr *z0, fpr *z1,
|
|
const fpr *tree,
|
|
const fpr *t0, const fpr *t1, unsigned logn,
|
|
fpr *tmp) {
|
|
size_t n, hn;
|
|
const fpr *tree0, *tree1;
|
|
|
|
/*
|
|
* When logn == 2, we inline the last two recursion levels.
|
|
*/
|
|
if (logn == 2) {
|
|
fpr w0, w1, w2, w3, sigma;
|
|
__m128d ww0, ww1, wa, wb, wc, wd;
|
|
__m128d wy0, wy1, wz0, wz1;
|
|
__m128d half, invsqrt8, invsqrt2, neghi, neglo;
|
|
int si0, si1, si2, si3;
|
|
|
|
tree0 = tree + 4;
|
|
tree1 = tree + 8;
|
|
|
|
half = _mm_set1_pd(0.5);
|
|
invsqrt8 = _mm_set1_pd(0.353553390593273762200422181052);
|
|
invsqrt2 = _mm_set1_pd(0.707106781186547524400844362105);
|
|
neghi = _mm_set_pd(-0.0, 0.0);
|
|
neglo = _mm_set_pd(0.0, -0.0);
|
|
|
|
/*
|
|
* We split t1 into w*, then do the recursive invocation,
|
|
* with output in w*. We finally merge back into z1.
|
|
*/
|
|
ww0 = _mm_loadu_pd(&t1[0].v);
|
|
ww1 = _mm_loadu_pd(&t1[2].v);
|
|
wa = _mm_unpacklo_pd(ww0, ww1);
|
|
wb = _mm_unpackhi_pd(ww0, ww1);
|
|
wc = _mm_add_pd(wa, wb);
|
|
ww0 = _mm_mul_pd(wc, half);
|
|
wc = _mm_sub_pd(wa, wb);
|
|
wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
|
|
ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
|
|
|
|
w2.v = _mm_cvtsd_f64(ww1);
|
|
w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
|
|
wa = ww1;
|
|
sigma = tree1[3];
|
|
si2 = samp(samp_ctx, w2, sigma);
|
|
si3 = samp(samp_ctx, w3, sigma);
|
|
ww1 = _mm_set_pd((double)si3, (double)si2);
|
|
wa = _mm_sub_pd(wa, ww1);
|
|
wb = _mm_loadu_pd(&tree1[0].v);
|
|
wc = _mm_mul_pd(wa, wb);
|
|
wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
|
|
wa = _mm_unpacklo_pd(wc, wd);
|
|
wb = _mm_unpackhi_pd(wc, wd);
|
|
ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
|
|
w0.v = _mm_cvtsd_f64(ww0);
|
|
w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
|
|
sigma = tree1[2];
|
|
si0 = samp(samp_ctx, w0, sigma);
|
|
si1 = samp(samp_ctx, w1, sigma);
|
|
ww0 = _mm_set_pd((double)si1, (double)si0);
|
|
|
|
wc = _mm_mul_pd(
|
|
_mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
|
|
invsqrt2);
|
|
wa = _mm_add_pd(ww0, wc);
|
|
wb = _mm_sub_pd(ww0, wc);
|
|
ww0 = _mm_unpacklo_pd(wa, wb);
|
|
ww1 = _mm_unpackhi_pd(wa, wb);
|
|
_mm_storeu_pd(&z1[0].v, ww0);
|
|
_mm_storeu_pd(&z1[2].v, ww1);
|
|
|
|
/*
|
|
* Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in w*.
|
|
*/
|
|
wy0 = _mm_sub_pd(_mm_loadu_pd(&t1[0].v), ww0);
|
|
wy1 = _mm_sub_pd(_mm_loadu_pd(&t1[2].v), ww1);
|
|
wz0 = _mm_loadu_pd(&tree[0].v);
|
|
wz1 = _mm_loadu_pd(&tree[2].v);
|
|
ww0 = _mm_sub_pd(_mm_mul_pd(wy0, wz0), _mm_mul_pd(wy1, wz1));
|
|
ww1 = _mm_add_pd(_mm_mul_pd(wy0, wz1), _mm_mul_pd(wy1, wz0));
|
|
ww0 = _mm_add_pd(ww0, _mm_loadu_pd(&t0[0].v));
|
|
ww1 = _mm_add_pd(ww1, _mm_loadu_pd(&t0[2].v));
|
|
|
|
/*
|
|
* Second recursive invocation.
|
|
*/
|
|
wa = _mm_unpacklo_pd(ww0, ww1);
|
|
wb = _mm_unpackhi_pd(ww0, ww1);
|
|
wc = _mm_add_pd(wa, wb);
|
|
ww0 = _mm_mul_pd(wc, half);
|
|
wc = _mm_sub_pd(wa, wb);
|
|
wd = _mm_xor_pd(_mm_permute_pd(wc, 1), neghi);
|
|
ww1 = _mm_mul_pd(_mm_add_pd(wc, wd), invsqrt8);
|
|
|
|
w2.v = _mm_cvtsd_f64(ww1);
|
|
w3.v = _mm_cvtsd_f64(_mm_permute_pd(ww1, 1));
|
|
wa = ww1;
|
|
sigma = tree0[3];
|
|
si2 = samp(samp_ctx, w2, sigma);
|
|
si3 = samp(samp_ctx, w3, sigma);
|
|
ww1 = _mm_set_pd((double)si3, (double)si2);
|
|
wa = _mm_sub_pd(wa, ww1);
|
|
wb = _mm_loadu_pd(&tree0[0].v);
|
|
wc = _mm_mul_pd(wa, wb);
|
|
wd = _mm_mul_pd(wa, _mm_permute_pd(wb, 1));
|
|
wa = _mm_unpacklo_pd(wc, wd);
|
|
wb = _mm_unpackhi_pd(wc, wd);
|
|
ww0 = _mm_add_pd(ww0, _mm_add_pd(wa, _mm_xor_pd(wb, neglo)));
|
|
w0.v = _mm_cvtsd_f64(ww0);
|
|
w1.v = _mm_cvtsd_f64(_mm_permute_pd(ww0, 1));
|
|
sigma = tree0[2];
|
|
si0 = samp(samp_ctx, w0, sigma);
|
|
si1 = samp(samp_ctx, w1, sigma);
|
|
ww0 = _mm_set_pd((double)si1, (double)si0);
|
|
|
|
wc = _mm_mul_pd(
|
|
_mm_set_pd((double)(si2 + si3), (double)(si2 - si3)),
|
|
invsqrt2);
|
|
wa = _mm_add_pd(ww0, wc);
|
|
wb = _mm_sub_pd(ww0, wc);
|
|
ww0 = _mm_unpacklo_pd(wa, wb);
|
|
ww1 = _mm_unpackhi_pd(wa, wb);
|
|
_mm_storeu_pd(&z0[0].v, ww0);
|
|
_mm_storeu_pd(&z0[2].v, ww1);
|
|
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Case logn == 1 is reachable only when using Falcon-2 (the
|
|
* smallest size for which Falcon is mathematically defined, but
|
|
* of course way too insecure to be of any use).
|
|
*/
|
|
if (logn == 1) {
|
|
fpr x0, x1, y0, y1, sigma;
|
|
fpr a_re, a_im, b_re, b_im, c_re, c_im;
|
|
|
|
x0 = t1[0];
|
|
x1 = t1[1];
|
|
sigma = tree[3];
|
|
z1[0] = y0 = fpr_of(samp(samp_ctx, x0, sigma));
|
|
z1[1] = y1 = fpr_of(samp(samp_ctx, x1, sigma));
|
|
a_re = fpr_sub(x0, y0);
|
|
a_im = fpr_sub(x1, y1);
|
|
b_re = tree[0];
|
|
b_im = tree[1];
|
|
c_re = fpr_sub(fpr_mul(a_re, b_re), fpr_mul(a_im, b_im));
|
|
c_im = fpr_add(fpr_mul(a_re, b_im), fpr_mul(a_im, b_re));
|
|
x0 = fpr_add(c_re, t0[0]);
|
|
x1 = fpr_add(c_im, t0[1]);
|
|
sigma = tree[2];
|
|
z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
|
|
z0[1] = fpr_of(samp(samp_ctx, x1, sigma));
|
|
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Normal end of recursion is for logn == 0. Since the last
|
|
* steps of the recursions were inlined in the blocks above
|
|
* (when logn == 1 or 2), this case is not reachable, and is
|
|
* retained here only for documentation purposes.
|
|
|
|
if (logn == 0) {
|
|
fpr x0, x1, sigma;
|
|
|
|
x0 = t0[0];
|
|
x1 = t1[0];
|
|
sigma = tree[0];
|
|
z0[0] = fpr_of(samp(samp_ctx, x0, sigma));
|
|
z1[0] = fpr_of(samp(samp_ctx, x1, sigma));
|
|
return;
|
|
}
|
|
|
|
*/
|
|
|
|
/*
|
|
* General recursive case (logn >= 3).
|
|
*/
|
|
|
|
n = (size_t)1 << logn;
|
|
hn = n >> 1;
|
|
tree0 = tree + n;
|
|
tree1 = tree + n + ffLDL_treesize(logn - 1);
|
|
|
|
/*
|
|
* We split t1 into z1 (reused as temporary storage), then do
|
|
* the recursive invocation, with output in tmp. We finally
|
|
* merge back into z1.
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(z1, z1 + hn, t1, logn);
|
|
ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
|
|
tree1, z1, z1 + hn, logn - 1, tmp + n);
|
|
PQCLEAN_FALCON1024_AVX2_poly_merge_fft(z1, tmp, tmp + hn, logn);
|
|
|
|
/*
|
|
* Compute tb0 = t0 + (t1 - z1) * L. Value tb0 ends up in tmp[].
|
|
*/
|
|
memcpy(tmp, t1, n * sizeof * t1);
|
|
PQCLEAN_FALCON1024_AVX2_poly_sub(tmp, z1, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(tmp, tree, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(tmp, t0, logn);
|
|
|
|
/*
|
|
* Second recursive invocation.
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_poly_split_fft(z0, z0 + hn, tmp, logn);
|
|
ffSampling_fft(samp, samp_ctx, tmp, tmp + hn,
|
|
tree0, z0, z0 + hn, logn - 1, tmp + n);
|
|
PQCLEAN_FALCON1024_AVX2_poly_merge_fft(z0, tmp, tmp + hn, logn);
|
|
}
|
|
|
|
/*
|
|
* Compute a signature: the signature contains two vectors, s1 and s2.
|
|
* The s1 vector is not returned. The squared norm of (s1,s2) is
|
|
* computed, and if it is short enough, then s2 is returned into the
|
|
* s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
|
|
* returned; the caller should then try again. This function uses an
|
|
* expanded key.
|
|
*
|
|
* tmp[] must have room for at least six polynomials.
|
|
*/
|
|
static int
|
|
do_sign_tree(samplerZ samp, void *samp_ctx, int16_t *s2,
|
|
const fpr *expanded_key,
|
|
const uint16_t *hm,
|
|
unsigned logn, fpr *tmp) {
|
|
size_t n, u;
|
|
fpr *t0, *t1, *tx, *ty;
|
|
const fpr *b00, *b01, *b10, *b11, *tree;
|
|
fpr ni;
|
|
uint32_t sqn, ng;
|
|
int16_t *s1tmp, *s2tmp;
|
|
|
|
n = MKN(logn);
|
|
t0 = tmp;
|
|
t1 = t0 + n;
|
|
b00 = expanded_key + skoff_b00(logn);
|
|
b01 = expanded_key + skoff_b01(logn);
|
|
b10 = expanded_key + skoff_b10(logn);
|
|
b11 = expanded_key + skoff_b11(logn);
|
|
tree = expanded_key + skoff_tree(logn);
|
|
|
|
/*
|
|
* Set the target vector to [hm, 0] (hm is the hashed message).
|
|
*/
|
|
for (u = 0; u < n; u ++) {
|
|
t0[u] = fpr_of(hm[u]);
|
|
/* This is implicit.
|
|
t1[u] = fpr_zero;
|
|
*/
|
|
}
|
|
|
|
/*
|
|
* Apply the lattice basis to obtain the real target
|
|
* vector (after normalization with regards to modulus).
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_FFT(t0, logn);
|
|
ni = fpr_inverse_of_q;
|
|
memcpy(t1, t0, n * sizeof * t0);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t1, b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t0, b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulconst(t0, ni, logn);
|
|
|
|
tx = t1 + n;
|
|
ty = tx + n;
|
|
|
|
/*
|
|
* Apply sampling. Output is written back in [tx, ty].
|
|
*/
|
|
ffSampling_fft(samp, samp_ctx, tx, ty, tree, t0, t1, logn, ty + n);
|
|
|
|
/*
|
|
* Get the lattice point corresponding to that tiny vector.
|
|
*/
|
|
memcpy(t0, tx, n * sizeof * tx);
|
|
memcpy(t1, ty, n * sizeof * ty);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(tx, b00, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(ty, b10, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(tx, ty, logn);
|
|
memcpy(ty, t0, n * sizeof * t0);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(ty, b01, logn);
|
|
|
|
memcpy(t0, tx, n * sizeof * tx);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t1, b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(t1, ty, logn);
|
|
|
|
PQCLEAN_FALCON1024_AVX2_iFFT(t0, logn);
|
|
PQCLEAN_FALCON1024_AVX2_iFFT(t1, logn);
|
|
|
|
/*
|
|
* Compute the signature.
|
|
*/
|
|
s1tmp = (int16_t *)tx;
|
|
sqn = 0;
|
|
ng = 0;
|
|
for (u = 0; u < n; u ++) {
|
|
int32_t z;
|
|
|
|
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
|
|
sqn += (uint32_t)(z * z);
|
|
ng |= sqn;
|
|
s1tmp[u] = (int16_t)z;
|
|
}
|
|
sqn |= -(ng >> 31);
|
|
|
|
/*
|
|
* With "normal" degrees (e.g. 512 or 1024), it is very
|
|
* improbable that the computed vector is not short enough;
|
|
* however, it may happen in practice for the very reduced
|
|
* versions (e.g. degree 16 or below). In that case, the caller
|
|
* will loop, and we must not write anything into s2[] because
|
|
* s2[] may overlap with the hashed message hm[] and we need
|
|
* hm[] for the next iteration.
|
|
*/
|
|
s2tmp = (int16_t *)tmp;
|
|
for (u = 0; u < n; u ++) {
|
|
s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
|
|
}
|
|
if (PQCLEAN_FALCON1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
|
|
memcpy(s2, s2tmp, n * sizeof * s2);
|
|
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Compute a signature: the signature contains two vectors, s1 and s2.
|
|
* The s1 vector is not returned. The squared norm of (s1,s2) is
|
|
* computed, and if it is short enough, then s2 is returned into the
|
|
* s2[] buffer, and 1 is returned; otherwise, s2[] is untouched and 0 is
|
|
* returned; the caller should then try again.
|
|
*
|
|
* tmp[] must have room for at least nine polynomials.
|
|
*/
|
|
static int
|
|
do_sign_dyn(samplerZ samp, void *samp_ctx, int16_t *s2,
|
|
const int8_t *f, const int8_t *g,
|
|
const int8_t *F, const int8_t *G,
|
|
const uint16_t *hm, unsigned logn, fpr *tmp) {
|
|
size_t n, u;
|
|
fpr *t0, *t1, *tx, *ty;
|
|
fpr *b00, *b01, *b10, *b11, *g00, *g01, *g11;
|
|
fpr ni;
|
|
uint32_t sqn, ng;
|
|
int16_t *s1tmp, *s2tmp;
|
|
|
|
n = MKN(logn);
|
|
|
|
/*
|
|
* Lattice basis is B = [[g, -f], [G, -F]]. We convert it to FFT.
|
|
*/
|
|
b00 = tmp;
|
|
b01 = b00 + n;
|
|
b10 = b01 + n;
|
|
b11 = b10 + n;
|
|
smallints_to_fpr(b01, f, logn);
|
|
smallints_to_fpr(b00, g, logn);
|
|
smallints_to_fpr(b11, F, logn);
|
|
smallints_to_fpr(b10, G, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b00, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b10, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(b11, logn);
|
|
|
|
/*
|
|
* Compute the Gram matrix G = B·B*. Formulas are:
|
|
* g00 = b00*adj(b00) + b01*adj(b01)
|
|
* g01 = b00*adj(b10) + b01*adj(b11)
|
|
* g10 = b10*adj(b00) + b11*adj(b01)
|
|
* g11 = b10*adj(b10) + b11*adj(b11)
|
|
*
|
|
* For historical reasons, this implementation uses
|
|
* g00, g01 and g11 (upper triangle). g10 is not kept
|
|
* since it is equal to adj(g01).
|
|
*
|
|
* We _replace_ the matrix B with the Gram matrix, but we
|
|
* must keep b01 and b11 for computing the target vector.
|
|
*/
|
|
t0 = b11 + n;
|
|
t1 = t0 + n;
|
|
|
|
memcpy(t0, b01, n * sizeof * b01);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(t0, logn); // t0 <- b01*adj(b01)
|
|
|
|
memcpy(t1, b00, n * sizeof * b00);
|
|
PQCLEAN_FALCON1024_AVX2_poly_muladj_fft(t1, b10, logn); // t1 <- b00*adj(b10)
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(b00, logn); // b00 <- b00*adj(b00)
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(b00, t0, logn); // b00 <- g00
|
|
memcpy(t0, b01, n * sizeof * b01);
|
|
PQCLEAN_FALCON1024_AVX2_poly_muladj_fft(b01, b11, logn); // b01 <- b01*adj(b11)
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(b01, t1, logn); // b01 <- g01
|
|
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(b10, logn); // b10 <- b10*adj(b10)
|
|
memcpy(t1, b11, n * sizeof * b11);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulselfadj_fft(t1, logn); // t1 <- b11*adj(b11)
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(b10, t1, logn); // b10 <- g11
|
|
|
|
/*
|
|
* We rename variables to make things clearer. The three elements
|
|
* of the Gram matrix uses the first 3*n slots of tmp[], followed
|
|
* by b11 and b01 (in that order).
|
|
*/
|
|
g00 = b00;
|
|
g01 = b01;
|
|
g11 = b10;
|
|
b01 = t0;
|
|
t0 = b01 + n;
|
|
t1 = t0 + n;
|
|
|
|
/*
|
|
* Memory layout at that point:
|
|
* g00 g01 g11 b11 b01 t0 t1
|
|
*/
|
|
|
|
/*
|
|
* Set the target vector to [hm, 0] (hm is the hashed message).
|
|
*/
|
|
for (u = 0; u < n; u ++) {
|
|
t0[u] = fpr_of(hm[u]);
|
|
/* This is implicit.
|
|
t1[u] = fpr_zero;
|
|
*/
|
|
}
|
|
|
|
/*
|
|
* Apply the lattice basis to obtain the real target
|
|
* vector (after normalization with regards to modulus).
|
|
*/
|
|
PQCLEAN_FALCON1024_AVX2_FFT(t0, logn);
|
|
ni = fpr_inverse_of_q;
|
|
memcpy(t1, t0, n * sizeof * t0);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t1, b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulconst(t1, fpr_neg(ni), logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t0, b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mulconst(t0, ni, logn);
|
|
|
|
/*
|
|
* b01 and b11 can be discarded, so we move back (t0,t1).
|
|
* Memory layout is now:
|
|
* g00 g01 g11 t0 t1
|
|
*/
|
|
memcpy(b11, t0, n * 2 * sizeof * t0);
|
|
t0 = g11 + n;
|
|
t1 = t0 + n;
|
|
|
|
/*
|
|
* Apply sampling; result is written over (t0,t1).
|
|
*/
|
|
ffSampling_fft_dyntree(samp, samp_ctx,
|
|
t0, t1, g00, g01, g11, logn, t1 + n);
|
|
|
|
/*
|
|
* We arrange the layout back to:
|
|
* b00 b01 b10 b11 t0 t1
|
|
*
|
|
* We did not conserve the matrix basis, so we must recompute
|
|
* it now.
|
|
*/
|
|
b00 = tmp;
|
|
b01 = b00 + n;
|
|
b10 = b01 + n;
|
|
b11 = b10 + n;
|
|
memmove(b11 + n, t0, n * 2 * sizeof * t0);
|
|
t0 = b11 + n;
|
|
t1 = t0 + n;
|
|
smallints_to_fpr(b01, f, logn);
|
|
smallints_to_fpr(b00, g, logn);
|
|
smallints_to_fpr(b11, F, logn);
|
|
smallints_to_fpr(b10, G, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b00, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_FFT(b10, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(b01, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_neg(b11, logn);
|
|
tx = t1 + n;
|
|
ty = tx + n;
|
|
|
|
/*
|
|
* Get the lattice point corresponding to that tiny vector.
|
|
*/
|
|
memcpy(tx, t0, n * sizeof * t0);
|
|
memcpy(ty, t1, n * sizeof * t1);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(tx, b00, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(ty, b10, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(tx, ty, logn);
|
|
memcpy(ty, t0, n * sizeof * t0);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(ty, b01, logn);
|
|
|
|
memcpy(t0, tx, n * sizeof * tx);
|
|
PQCLEAN_FALCON1024_AVX2_poly_mul_fft(t1, b11, logn);
|
|
PQCLEAN_FALCON1024_AVX2_poly_add(t1, ty, logn);
|
|
PQCLEAN_FALCON1024_AVX2_iFFT(t0, logn);
|
|
PQCLEAN_FALCON1024_AVX2_iFFT(t1, logn);
|
|
|
|
s1tmp = (int16_t *)tx;
|
|
sqn = 0;
|
|
ng = 0;
|
|
for (u = 0; u < n; u ++) {
|
|
int32_t z;
|
|
|
|
z = (int32_t)hm[u] - (int32_t)fpr_rint(t0[u]);
|
|
sqn += (uint32_t)(z * z);
|
|
ng |= sqn;
|
|
s1tmp[u] = (int16_t)z;
|
|
}
|
|
sqn |= -(ng >> 31);
|
|
|
|
/*
|
|
* With "normal" degrees (e.g. 512 or 1024), it is very
|
|
* improbable that the computed vector is not short enough;
|
|
* however, it may happen in practice for the very reduced
|
|
* versions (e.g. degree 16 or below). In that case, the caller
|
|
* will loop, and we must not write anything into s2[] because
|
|
* s2[] may overlap with the hashed message hm[] and we need
|
|
* hm[] for the next iteration.
|
|
*/
|
|
s2tmp = (int16_t *)tmp;
|
|
for (u = 0; u < n; u ++) {
|
|
s2tmp[u] = (int16_t) - fpr_rint(t1[u]);
|
|
}
|
|
if (PQCLEAN_FALCON1024_AVX2_is_short_half(sqn, s2tmp, logn)) {
|
|
memcpy(s2, s2tmp, n * sizeof * s2);
|
|
memcpy(tmp, s1tmp, n * sizeof * s1tmp);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Sample an integer value along a half-gaussian distribution centered
|
|
* on zero and standard deviation 1.8205, with a precision of 72 bits.
|
|
*/
|
|
int
|
|
PQCLEAN_FALCON1024_AVX2_gaussian0_sampler(prng *p) {
|
|
|
|
/*
|
|
* High words.
|
|
*/
|
|
static const union {
|
|
uint16_t u16[16];
|
|
__m256i ymm[1];
|
|
} rhi15 = {
|
|
{
|
|
0x51FB, 0x2A69, 0x113E, 0x0568,
|
|
0x014A, 0x003B, 0x0008, 0x0000,
|
|
0x0000, 0x0000, 0x0000, 0x0000,
|
|
0x0000, 0x0000, 0x0000, 0x0000
|
|
}
|
|
};
|
|
|
|
static const union {
|
|
uint64_t u64[20];
|
|
__m256i ymm[5];
|
|
} rlo57 = {
|
|
{
|
|
0x1F42ED3AC391802, 0x12B181F3F7DDB82,
|
|
0x1CDD0934829C1FF, 0x1754377C7994AE4,
|
|
0x1846CAEF33F1F6F, 0x14AC754ED74BD5F,
|
|
0x024DD542B776AE4, 0x1A1FFDC65AD63DA,
|
|
0x01F80D88A7B6428, 0x001C3FDB2040C69,
|
|
0x00012CF24D031FB, 0x00000949F8B091F,
|
|
0x0000003665DA998, 0x00000000EBF6EBB,
|
|
0x0000000002F5D7E, 0x000000000007098,
|
|
0x0000000000000C6, 0x000000000000001,
|
|
0x000000000000000, 0x000000000000000
|
|
}
|
|
};
|
|
|
|
uint64_t lo;
|
|
unsigned hi;
|
|
__m256i xhi, rhi, gthi, eqhi, eqm;
|
|
__m256i xlo, gtlo0, gtlo1, gtlo2, gtlo3, gtlo4;
|
|
__m128i t, zt;
|
|
int r;
|
|
|
|
/*
|
|
* Get a 72-bit random value and split it into a low part
|
|
* (57 bits) and a high part (15 bits)
|
|
*/
|
|
lo = prng_get_u64(p);
|
|
hi = prng_get_u8(p);
|
|
hi = (hi << 7) | (unsigned)(lo >> 57);
|
|
lo &= 0x1FFFFFFFFFFFFFF;
|
|
|
|
/*
|
|
* Broadcast the high part and compare it with the relevant
|
|
* values. We need both a "greater than" and an "equal"
|
|
* comparisons.
|
|
*/
|
|
xhi = _mm256_broadcastw_epi16(_mm_cvtsi32_si128((int32_t)hi));
|
|
rhi = _mm256_loadu_si256(&rhi15.ymm[0]);
|
|
gthi = _mm256_cmpgt_epi16(rhi, xhi);
|
|
eqhi = _mm256_cmpeq_epi16(rhi, xhi);
|
|
|
|
/*
|
|
* The result is the number of 72-bit values (among the list of 19)
|
|
* which are greater than the 72-bit random value. We first count
|
|
* all non-zero 16-bit elements in the first eight of gthi. Such
|
|
* elements have value -1 or 0, so we first negate them.
|
|
*/
|
|
t = _mm_srli_epi16(_mm256_castsi256_si128(gthi), 15);
|
|
zt = _mm_setzero_si128();
|
|
t = _mm_hadd_epi16(t, zt);
|
|
t = _mm_hadd_epi16(t, zt);
|
|
t = _mm_hadd_epi16(t, zt);
|
|
r = _mm_cvtsi128_si32(t);
|
|
|
|
/*
|
|
* We must look at the low bits for all values for which the
|
|
* high bits are an "equal" match; values 8-18 all have the
|
|
* same high bits (0).
|
|
* On 32-bit systems, 'lo' really is two registers, requiring
|
|
* some extra code.
|
|
*/
|
|
xlo = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(*(int64_t *)&lo));
|
|
gtlo0 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[0]), xlo);
|
|
gtlo1 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[1]), xlo);
|
|
gtlo2 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[2]), xlo);
|
|
gtlo3 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[3]), xlo);
|
|
gtlo4 = _mm256_cmpgt_epi64(_mm256_loadu_si256(&rlo57.ymm[4]), xlo);
|
|
|
|
/*
|
|
* Keep only comparison results that correspond to the non-zero
|
|
* elements in eqhi.
|
|
*/
|
|
gtlo0 = _mm256_and_si256(gtlo0, _mm256_cvtepi16_epi64(
|
|
_mm256_castsi256_si128(eqhi)));
|
|
gtlo1 = _mm256_and_si256(gtlo1, _mm256_cvtepi16_epi64(
|
|
_mm256_castsi256_si128(_mm256_bsrli_epi128(eqhi, 8))));
|
|
eqm = _mm256_permute4x64_epi64(eqhi, 0xFF);
|
|
gtlo2 = _mm256_and_si256(gtlo2, eqm);
|
|
gtlo3 = _mm256_and_si256(gtlo3, eqm);
|
|
gtlo4 = _mm256_and_si256(gtlo4, eqm);
|
|
|
|
/*
|
|
* Add all values to count the total number of "-1" elements.
|
|
* Since the first eight "high" words are all different, only
|
|
* one element (at most) in gtlo0:gtlo1 can be non-zero; however,
|
|
* if the high word of the random value is zero, then many
|
|
* elements of gtlo2:gtlo3:gtlo4 can be non-zero.
|
|
*/
|
|
gtlo0 = _mm256_or_si256(gtlo0, gtlo1);
|
|
gtlo0 = _mm256_add_epi64(
|
|
_mm256_add_epi64(gtlo0, gtlo2),
|
|
_mm256_add_epi64(gtlo3, gtlo4));
|
|
t = _mm_add_epi64(
|
|
_mm256_castsi256_si128(gtlo0),
|
|
_mm256_extracti128_si256(gtlo0, 1));
|
|
t = _mm_add_epi64(t, _mm_srli_si128(t, 8));
|
|
r -= _mm_cvtsi128_si32(t);
|
|
|
|
return r;
|
|
|
|
}
|
|
|
|
/*
|
|
* Sample a bit with probability exp(-x) for some x >= 0.
|
|
*/
|
|
static int
|
|
BerExp(prng *p, fpr x, fpr ccs) {
|
|
int s, i;
|
|
fpr r;
|
|
uint32_t sw, w;
|
|
uint64_t z;
|
|
|
|
/*
|
|
* Reduce x modulo log(2): x = s*log(2) + r, with s an integer,
|
|
* and 0 <= r < log(2). Since x >= 0, we can use fpr_trunc().
|
|
*/
|
|
s = (int)fpr_trunc(fpr_mul(x, fpr_inv_log2));
|
|
r = fpr_sub(x, fpr_mul(fpr_of(s), fpr_log2));
|
|
|
|
/*
|
|
* It may happen (quite rarely) that s >= 64; if sigma = 1.2
|
|
* (the minimum value for sigma), r = 0 and b = 1, then we get
|
|
* s >= 64 if the half-Gaussian produced a z >= 13, which happens
|
|
* with probability about 0.000000000230383991, which is
|
|
* approximatively equal to 2^(-32). In any case, if s >= 64,
|
|
* then BerExp will be non-zero with probability less than
|
|
* 2^(-64), so we can simply saturate s at 63.
|
|
*/
|
|
sw = (uint32_t)s;
|
|
sw ^= (sw ^ 63) & -((63 - sw) >> 31);
|
|
s = (int)sw;
|
|
|
|
/*
|
|
* Compute exp(-r); we know that 0 <= r < log(2) at this point, so
|
|
* we can use fpr_expm_p63(), which yields a result scaled to 2^63.
|
|
* We scale it up to 2^64, then right-shift it by s bits because
|
|
* we really want exp(-x) = 2^(-s)*exp(-r).
|
|
*
|
|
* The "-1" operation makes sure that the value fits on 64 bits
|
|
* (i.e. if r = 0, we may get 2^64, and we prefer 2^64-1 in that
|
|
* case). The bias is negligible since fpr_expm_p63() only computes
|
|
* with 51 bits of precision or so.
|
|
*/
|
|
z = ((fpr_expm_p63(r, ccs) << 1) - 1) >> s;
|
|
|
|
/*
|
|
* Sample a bit with probability exp(-x). Since x = s*log(2) + r,
|
|
* exp(-x) = 2^-s * exp(-r), we compare lazily exp(-x) with the
|
|
* PRNG output to limit its consumption, the sign of the difference
|
|
* yields the expected result.
|
|
*/
|
|
i = 64;
|
|
do {
|
|
i -= 8;
|
|
w = prng_get_u8(p) - ((uint32_t)(z >> i) & 0xFF);
|
|
} while (!w && i > 0);
|
|
return (int)(w >> 31);
|
|
}
|
|
|
|
/*
|
|
* The sampler produces a random integer that follows a discrete Gaussian
|
|
* distribution, centered on mu, and with standard deviation sigma. The
|
|
* provided parameter isigma is equal to 1/sigma.
|
|
*
|
|
* The value of sigma MUST lie between 1 and 2 (i.e. isigma lies between
|
|
* 0.5 and 1); in Falcon, sigma should always be between 1.2 and 1.9.
|
|
*/
|
|
int
|
|
PQCLEAN_FALCON1024_AVX2_sampler(void *ctx, fpr mu, fpr isigma) {
|
|
sampler_context *spc;
|
|
int s, z0, z, b;
|
|
fpr r, dss, ccs, x;
|
|
|
|
spc = ctx;
|
|
|
|
/*
|
|
* Center is mu. We compute mu = s + r where s is an integer
|
|
* and 0 <= r < 1.
|
|
*/
|
|
s = (int)fpr_floor(mu);
|
|
r = fpr_sub(mu, fpr_of(s));
|
|
|
|
/*
|
|
* dss = 1/(2*sigma^2) = 0.5*(isigma^2).
|
|
*/
|
|
dss = fpr_half(fpr_sqr(isigma));
|
|
|
|
/*
|
|
* ccs = sigma_min / sigma = sigma_min * isigma.
|
|
*/
|
|
ccs = fpr_mul(isigma, spc->sigma_min);
|
|
|
|
/*
|
|
* We now need to sample on center r.
|
|
*/
|
|
for (;;) {
|
|
/*
|
|
* Sample z for a Gaussian distribution. Then get a
|
|
* random bit b to turn the sampling into a bimodal
|
|
* distribution: if b = 1, we use z+1, otherwise we
|
|
* use -z. We thus have two situations:
|
|
*
|
|
* - b = 1: z >= 1 and sampled against a Gaussian
|
|
* centered on 1.
|
|
* - b = 0: z <= 0 and sampled against a Gaussian
|
|
* centered on 0.
|
|
*/
|
|
z0 = PQCLEAN_FALCON1024_AVX2_gaussian0_sampler(&spc->p);
|
|
b = (int)prng_get_u8(&spc->p) & 1;
|
|
z = b + ((b << 1) - 1) * z0;
|
|
|
|
/*
|
|
* Rejection sampling. We want a Gaussian centered on r;
|
|
* but we sampled against a Gaussian centered on b (0 or
|
|
* 1). But we know that z is always in the range where
|
|
* our sampling distribution is greater than the Gaussian
|
|
* distribution, so rejection works.
|
|
*
|
|
* We got z with distribution:
|
|
* G(z) = exp(-((z-b)^2)/(2*sigma0^2))
|
|
* We target distribution:
|
|
* S(z) = exp(-((z-r)^2)/(2*sigma^2))
|
|
* Rejection sampling works by keeping the value z with
|
|
* probability S(z)/G(z), and starting again otherwise.
|
|
* This requires S(z) <= G(z), which is the case here.
|
|
* Thus, we simply need to keep our z with probability:
|
|
* P = exp(-x)
|
|
* where:
|
|
* x = ((z-r)^2)/(2*sigma^2) - ((z-b)^2)/(2*sigma0^2)
|
|
*
|
|
* Here, we scale up the Bernouilli distribution, which
|
|
* makes rejection more probable, but makes rejection
|
|
* rate sufficiently decorrelated from the Gaussian
|
|
* center and standard deviation that the whole sampler
|
|
* can be said to be constant-time.
|
|
*/
|
|
x = fpr_mul(fpr_sqr(fpr_sub(fpr_of(z), r)), dss);
|
|
x = fpr_sub(x, fpr_mul(fpr_of(z0 * z0), fpr_inv_2sqrsigma0));
|
|
if (BerExp(&spc->p, x, ccs)) {
|
|
/*
|
|
* Rejection sampling was centered on r, but the
|
|
* actual center is mu = s + r.
|
|
*/
|
|
return s + z;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* see inner.h */
|
|
void
|
|
PQCLEAN_FALCON1024_AVX2_sign_tree(int16_t *sig, inner_shake256_context *rng,
|
|
const fpr *expanded_key,
|
|
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
|
|
fpr *ftmp;
|
|
|
|
ftmp = (fpr *)tmp;
|
|
for (;;) {
|
|
/*
|
|
* Signature produces short vectors s1 and s2. The
|
|
* signature is acceptable only if the aggregate vector
|
|
* s1,s2 is short; we must use the same bound as the
|
|
* verifier.
|
|
*
|
|
* If the signature is acceptable, then we return only s2
|
|
* (the verifier recomputes s1 from s2, the hashed message,
|
|
* and the public key).
|
|
*/
|
|
sampler_context spc;
|
|
samplerZ samp;
|
|
void *samp_ctx;
|
|
|
|
/*
|
|
* Normal sampling. We use a fast PRNG seeded from our
|
|
* SHAKE context ('rng').
|
|
*/
|
|
if (logn == 10) {
|
|
spc.sigma_min = fpr_sigma_min_10;
|
|
} else {
|
|
spc.sigma_min = fpr_sigma_min_9;
|
|
}
|
|
PQCLEAN_FALCON1024_AVX2_prng_init(&spc.p, rng);
|
|
samp = PQCLEAN_FALCON1024_AVX2_sampler;
|
|
samp_ctx = &spc;
|
|
|
|
/*
|
|
* Do the actual signature.
|
|
*/
|
|
if (do_sign_tree(samp, samp_ctx, sig,
|
|
expanded_key, hm, logn, ftmp)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* see inner.h */
|
|
void
|
|
PQCLEAN_FALCON1024_AVX2_sign_dyn(int16_t *sig, inner_shake256_context *rng,
|
|
const int8_t *f, const int8_t *g,
|
|
const int8_t *F, const int8_t *G,
|
|
const uint16_t *hm, unsigned logn, uint8_t *tmp) {
|
|
fpr *ftmp;
|
|
|
|
ftmp = (fpr *)tmp;
|
|
for (;;) {
|
|
/*
|
|
* Signature produces short vectors s1 and s2. The
|
|
* signature is acceptable only if the aggregate vector
|
|
* s1,s2 is short; we must use the same bound as the
|
|
* verifier.
|
|
*
|
|
* If the signature is acceptable, then we return only s2
|
|
* (the verifier recomputes s1 from s2, the hashed message,
|
|
* and the public key).
|
|
*/
|
|
sampler_context spc;
|
|
samplerZ samp;
|
|
void *samp_ctx;
|
|
|
|
/*
|
|
* Normal sampling. We use a fast PRNG seeded from our
|
|
* SHAKE context ('rng').
|
|
*/
|
|
if (logn == 10) {
|
|
spc.sigma_min = fpr_sigma_min_10;
|
|
} else {
|
|
spc.sigma_min = fpr_sigma_min_9;
|
|
}
|
|
PQCLEAN_FALCON1024_AVX2_prng_init(&spc.p, rng);
|
|
samp = PQCLEAN_FALCON1024_AVX2_sampler;
|
|
samp_ctx = &spc;
|
|
|
|
/*
|
|
* Do the actual signature.
|
|
*/
|
|
if (do_sign_dyn(samp, samp_ctx, sig,
|
|
f, g, F, G, hm, logn, ftmp)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|